diff gcc/config/ia64/vect.md @ 67:f6334be47118

update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
author nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
date Tue, 22 Mar 2011 17:18:12 +0900
parents 77e2b8dfacca
children 04ced10e8804
line wrap: on
line diff
--- a/gcc/config/ia64/vect.md	Tue May 25 18:58:51 2010 +0900
+++ b/gcc/config/ia64/vect.md	Tue Mar 22 17:18:12 2011 +0900
@@ -1,5 +1,5 @@
 ;; IA-64 machine description for vector operations.
-;; Copyright (C) 2004, 2005, 2007 Free Software Foundation, Inc.
+;; Copyright (C) 2004, 2005, 2007, 2010 Free Software Foundation, Inc.
 ;;
 ;; This file is part of GCC.
 ;;
@@ -24,6 +24,7 @@
 (define_mode_iterator VECINT12 [V8QI V4HI])
 (define_mode_iterator VECINT24 [V4HI V2SI])
 (define_mode_attr vecsize [(V8QI "1") (V4HI "2") (V2SI "4")])
+(define_mode_attr vecwider [(V8QI "V4HI") (V4HI "V2SI")])
 
 (define_expand "mov<mode>"
   [(set (match_operand:VECINT 0 "general_operand" "")
@@ -171,38 +172,73 @@
 		   (match_operand:V8QI 2 "gr_register_operand" "r")))]
   ""
 {
-  rtx r1, l1, r2, l2, rm, lm;
-
-  r1 = gen_reg_rtx (V4HImode);
-  l1 = gen_reg_rtx (V4HImode);
-  r2 = gen_reg_rtx (V4HImode);
-  l2 = gen_reg_rtx (V4HImode);
-
-  /* Zero-extend the QImode elements into two words of HImode elements
-     by interleaving them with zero bytes.  */
-  emit_insn (gen_mix1_r (gen_lowpart (V8QImode, r1),
-                         operands[1], CONST0_RTX (V8QImode)));
-  emit_insn (gen_mix1_r (gen_lowpart (V8QImode, r2),
-                         operands[2], CONST0_RTX (V8QImode)));
-  emit_insn (gen_mix1_l (gen_lowpart (V8QImode, l1),
-                         operands[1], CONST0_RTX (V8QImode)));
-  emit_insn (gen_mix1_l (gen_lowpart (V8QImode, l2),
-                         operands[2], CONST0_RTX (V8QImode)));
-
-  /* Multiply.  */
-  rm = gen_reg_rtx (V4HImode);
-  lm = gen_reg_rtx (V4HImode);
-  emit_insn (gen_mulv4hi3 (rm, r1, r2));
-  emit_insn (gen_mulv4hi3 (lm, l1, l2));
-
-  /* Zap the high order bytes of the HImode elements by overwriting those
-     in one part with the low order bytes of the other.  */
-  emit_insn (gen_mix1_r (operands[0],
-                         gen_lowpart (V8QImode, rm),
-                         gen_lowpart (V8QImode, lm)));
+  rtx l = gen_reg_rtx (V4HImode);
+  rtx h = gen_reg_rtx (V4HImode);
+  emit_insn (gen_vec_widen_umult_lo_v8qi (l, operands[1], operands[2]));
+  emit_insn (gen_vec_widen_umult_hi_v8qi (h, operands[1], operands[2]));
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_pack_trunc_v4hi (operands[0], h, l));
+  else
+    emit_insn (gen_vec_pack_trunc_v4hi (operands[0], l, h));
   DONE;
 })
 
+(define_expand "vec_widen_umult_lo_v8qi"
+  [(match_operand:V4HI 0 "gr_register_operand" "")
+   (match_operand:V8QI 1 "gr_register_operand" "")
+   (match_operand:V8QI 2 "gr_register_operand" "")]
+  ""
+{
+  rtx op1 = gen_reg_rtx (V4HImode);
+  rtx op2 = gen_reg_rtx (V4HImode);
+  emit_insn (gen_vec_unpacku_lo_v8qi (op1, operands[1]));
+  emit_insn (gen_vec_unpacku_lo_v8qi (op2, operands[2]));
+  emit_insn (gen_mulv4hi3 (operands[0], op1, op2));
+  DONE;
+});
+  
+(define_expand "vec_widen_umult_hi_v8qi"
+  [(match_operand:V4HI 0 "gr_register_operand" "")
+   (match_operand:V8QI 1 "gr_register_operand" "")
+   (match_operand:V8QI 2 "gr_register_operand" "")]
+  ""
+{
+  rtx op1 = gen_reg_rtx (V4HImode);
+  rtx op2 = gen_reg_rtx (V4HImode);
+  emit_insn (gen_vec_unpacku_hi_v8qi (op1, operands[1]));
+  emit_insn (gen_vec_unpacku_hi_v8qi (op2, operands[2]));
+  emit_insn (gen_mulv4hi3 (operands[0], op1, op2));
+  DONE;
+});
+  
+(define_expand "vec_widen_smult_lo_v8qi"
+  [(match_operand:V4HI 0 "gr_register_operand" "")
+   (match_operand:V8QI 1 "gr_register_operand" "")
+   (match_operand:V8QI 2 "gr_register_operand" "")]
+  ""
+{
+  rtx op1 = gen_reg_rtx (V4HImode);
+  rtx op2 = gen_reg_rtx (V4HImode);
+  emit_insn (gen_vec_unpacks_lo_v8qi (op1, operands[1]));
+  emit_insn (gen_vec_unpacks_lo_v8qi (op2, operands[2]));
+  emit_insn (gen_mulv4hi3 (operands[0], op1, op2));
+  DONE;
+});
+  
+(define_expand "vec_widen_smult_hi_v8qi"
+  [(match_operand:V4HI 0 "gr_register_operand" "")
+   (match_operand:V8QI 1 "gr_register_operand" "")
+   (match_operand:V8QI 2 "gr_register_operand" "")]
+  ""
+{
+  rtx op1 = gen_reg_rtx (V4HImode);
+  rtx op2 = gen_reg_rtx (V4HImode);
+  emit_insn (gen_vec_unpacks_hi_v8qi (op1, operands[1]));
+  emit_insn (gen_vec_unpacks_hi_v8qi (op2, operands[2]));
+  emit_insn (gen_mulv4hi3 (operands[0], op1, op2));
+  DONE;
+});
+  
 (define_insn "mulv4hi3"
   [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
 	(mult:V4HI (match_operand:V4HI 1 "gr_register_operand" "r")
@@ -211,7 +247,35 @@
   "pmpyshr2 %0 = %1, %2, 0"
   [(set_attr "itanium_class" "mmmul")])
 
-(define_insn "pmpy2_r"
+(define_insn "pmpyshr2"
+  [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
+	(truncate:V4HI
+	  (ashiftrt:V4SI
+	    (mult:V4SI
+	      (sign_extend:V4SI
+		(match_operand:V4HI 1 "gr_register_operand" "r"))
+	      (sign_extend:V4SI
+		(match_operand:V4HI 2 "gr_register_operand" "r")))
+	    (match_operand:SI 3 "pmpyshr_operand" "n"))))]
+  ""
+  "pmpyshr2 %0 = %1, %2, %3"
+  [(set_attr "itanium_class" "mmmul")])
+
+(define_insn "pmpyshr2_u"
+  [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
+	(truncate:V4HI
+	  (lshiftrt:V4SI
+	    (mult:V4SI
+	      (zero_extend:V4SI
+		(match_operand:V4HI 1 "gr_register_operand" "r"))
+	      (zero_extend:V4SI
+		(match_operand:V4HI 2 "gr_register_operand" "r")))
+	    (match_operand:SI 3 "pmpyshr_operand" "n"))))]
+  ""
+  "pmpyshr2.u %0 = %1, %2, %3"
+  [(set_attr "itanium_class" "mmmul")])
+
+(define_insn "pmpy2_even"
   [(set (match_operand:V2SI 0 "gr_register_operand" "=r")
 	(mult:V2SI
 	  (vec_select:V2SI
@@ -223,10 +287,16 @@
 	      (match_operand:V4HI 2 "gr_register_operand" "r"))
 	    (parallel [(const_int 0) (const_int 2)]))))]
   ""
-  "pmpy2.r %0 = %1, %2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,pmpy2.l %0 = %1, %2";
+  else
+    return "%,pmpy2.r %0 = %1, %2";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "pmpy2_l"
+(define_insn "pmpy2_odd"
   [(set (match_operand:V2SI 0 "gr_register_operand" "=r")
 	(mult:V2SI
 	  (vec_select:V2SI
@@ -238,9 +308,138 @@
 	      (match_operand:V4HI 2 "gr_register_operand" "r"))
 	    (parallel [(const_int 1) (const_int 3)]))))]
   ""
-  "pmpy2.l %0 = %1, %2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,pmpy2.r %0 = %1, %2";
+  else
+    return "%,pmpy2.l %0 = %1, %2";
+}
   [(set_attr "itanium_class" "mmshf")])
 
+(define_expand "vec_widen_smult_lo_v4hi"
+  [(match_operand:V2SI 0 "gr_register_operand" "")
+   (match_operand:V4HI 1 "gr_register_operand" "")
+   (match_operand:V4HI 2 "gr_register_operand" "")]
+  ""
+{
+  rtx l = gen_reg_rtx (V4HImode);
+  rtx h = gen_reg_rtx (V4HImode);
+  emit_insn (gen_mulv4hi3 (l, operands[1], operands[2]));
+  emit_insn (gen_pmpyshr2 (h, operands[1], operands[2], GEN_INT (16)));
+  ia64_unpack_assemble (operands[0], l, h, false);
+  DONE;
+})
+
+(define_expand "vec_widen_smult_hi_v4hi"
+  [(match_operand:V2SI 0 "gr_register_operand" "")
+   (match_operand:V4HI 1 "gr_register_operand" "")
+   (match_operand:V4HI 2 "gr_register_operand" "")]
+  ""
+{
+  rtx l = gen_reg_rtx (V4HImode);
+  rtx h = gen_reg_rtx (V4HImode);
+  emit_insn (gen_mulv4hi3 (l, operands[1], operands[2]));
+  emit_insn (gen_pmpyshr2 (h, operands[1], operands[2], GEN_INT (16)));
+  ia64_unpack_assemble (operands[0], l, h, true);
+  DONE;
+})
+
+(define_expand "vec_widen_umult_lo_v4hi"
+  [(match_operand:V2SI 0 "gr_register_operand" "")
+   (match_operand:V4HI 1 "gr_register_operand" "")
+   (match_operand:V4HI 2 "gr_register_operand" "")]
+  ""
+{
+  rtx l = gen_reg_rtx (V4HImode);
+  rtx h = gen_reg_rtx (V4HImode);
+  emit_insn (gen_mulv4hi3 (l, operands[1], operands[2]));
+  emit_insn (gen_pmpyshr2_u (h, operands[1], operands[2], GEN_INT (16)));
+  ia64_unpack_assemble (operands[0], l, h, false);
+  DONE;
+})
+
+(define_expand "vec_widen_umult_hi_v4hi"
+  [(match_operand:V2SI 0 "gr_register_operand" "")
+   (match_operand:V4HI 1 "gr_register_operand" "")
+   (match_operand:V4HI 2 "gr_register_operand" "")]
+  ""
+{
+  rtx l = gen_reg_rtx (V4HImode);
+  rtx h = gen_reg_rtx (V4HImode);
+  emit_insn (gen_mulv4hi3 (l, operands[1], operands[2]));
+  emit_insn (gen_pmpyshr2_u (h, operands[1], operands[2], GEN_INT (16)));
+  ia64_unpack_assemble (operands[0], l, h, true);
+  DONE;
+})
+
+(define_expand "mulv2si3"
+  [(set (match_operand:V2SI 0 "gr_register_operand" "")
+	(mult:V2SI (match_operand:V2SI 1 "gr_register_operand" "r")
+		   (match_operand:V2SI 2 "gr_register_operand" "r")))]
+  ""
+{
+  rtx t0, t1, t2, t3, t4, t5, t6, t7, x;
+  rtx op1h = gen_lowpart (V4HImode, operands[1]);
+  rtx op2h = gen_lowpart (V4HImode, operands[2]);
+
+  t0 = gen_reg_rtx (V4HImode);
+  t1 = gen_reg_rtx (V4HImode);
+  t2 = gen_reg_rtx (V4HImode);
+  t3 = gen_reg_rtx (V4HImode);
+  t4 = gen_reg_rtx (V2SImode);
+  t5 = gen_reg_rtx (V2SImode);
+  t6 = gen_reg_rtx (V2SImode);
+  t7 = gen_reg_rtx (V2SImode);
+
+  /* Consider the HImode components of op1 = DCBA, op2 = ZYXW.
+     Consider .l and .h suffixes below the low and high 16 bits
+     of the full 32-bit product.  */
+
+  /* T0 = CDBA.  */
+  x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (4, const1_rtx, const0_rtx,
+					     GEN_INT (3), const2_rtx));
+  x = gen_rtx_VEC_SELECT (V4HImode, op1h, x);
+  emit_insn (gen_rtx_SET (VOIDmode, t0, x));
+
+  /* T1 = DZ.l, CY.l, BX.l, AW.l.  */
+  emit_insn (gen_mulv4hi3 (t1, op1h, op2h));
+
+  /* T2 = DZ.h, CY.h, BX.h, AW.h.  */
+  emit_insn (gen_pmpyshr2_u (t2, op1h, op2h, GEN_INT (16)));
+
+  /* T3 = CZ.l, DY.l, AX.l, BW.l.  */
+  emit_insn (gen_mulv4hi3 (t3, t0, op2h));
+
+  /* T4 = CY.h, CY.l, AW.h, AW.l = CY, AW.  */
+  x = gen_lowpart (V4HImode, t4);
+  if (TARGET_BIG_ENDIAN)
+    x = gen_mix2_odd (x, t2, t1);
+  else
+    x = gen_mix2_even (x, t1, t2);
+  emit_insn (x);
+
+  /* T5 = CZ.l, 0, AX.l, 0 = CZ << 16, AX << 16.  */
+  x = gen_lowpart (V4HImode, t5);
+  if (TARGET_BIG_ENDIAN)
+    x = gen_mix2_even (x, t3, CONST0_RTX (V4HImode));
+  else
+    x = gen_mix2_odd (x, CONST0_RTX (V4HImode), t3);
+  emit_insn (x);
+
+  /* T6 = DY.l, 0, BW.l, 0 = DY << 16, BW << 16.  */
+  x = gen_lowpart (V4HImode, t6);
+  if (TARGET_BIG_ENDIAN)
+    x = gen_mix2_odd (x, t3, CONST0_RTX (V4HImode));
+  else
+    x = gen_mix2_even (x, CONST0_RTX (V4HImode), t3);
+  emit_insn (x);
+
+  emit_insn (gen_addv2si3 (t7, t4, t5));
+  emit_insn (gen_addv2si3 (operands[0], t6, t7));
+  DONE;
+})
+
 (define_expand "umax<mode>3"
   [(set (match_operand:VECINT 0 "gr_register_operand" "")
 	(umax:VECINT (match_operand:VECINT 1 "gr_register_operand" "")
@@ -429,16 +628,36 @@
    (match_operand:V2SI 3 "gr_register_operand" "")]
   ""
 {
-  rtx l, r, t;
+  rtx e, o, t;
 
-  r = gen_reg_rtx (V2SImode);
-  l = gen_reg_rtx (V2SImode);
+  e = gen_reg_rtx (V2SImode);
+  o = gen_reg_rtx (V2SImode);
   t = gen_reg_rtx (V2SImode);
 
-  emit_insn (gen_pmpy2_r (r, operands[1], operands[2]));
-  emit_insn (gen_pmpy2_l (l, operands[1], operands[2]));
-  emit_insn (gen_addv2si3 (t, r, operands[3]));
-  emit_insn (gen_addv2si3 (operands[0], t, l));
+  emit_insn (gen_pmpy2_even (e, operands[1], operands[2]));
+  emit_insn (gen_pmpy2_odd (o, operands[1], operands[2]));
+  emit_insn (gen_addv2si3 (t, e, operands[3]));
+  emit_insn (gen_addv2si3 (operands[0], t, o));
+  DONE;
+})
+
+(define_expand "udot_prodv4hi"
+  [(match_operand:V2SI 0 "gr_register_operand" "")
+   (match_operand:V4HI 1 "gr_register_operand" "")
+   (match_operand:V4HI 2 "gr_register_operand" "")
+   (match_operand:V2SI 3 "gr_register_operand" "")]
+  ""
+{
+  rtx l, h, t;
+
+  l = gen_reg_rtx (V2SImode);
+  h = gen_reg_rtx (V2SImode);
+  t = gen_reg_rtx (V2SImode);
+
+  emit_insn (gen_vec_widen_umult_lo_v4hi (l, operands[1], operands[2]));
+  emit_insn (gen_vec_widen_umult_hi_v4hi (h, operands[1], operands[2]));
+  emit_insn (gen_addv2si3 (t, l, operands[3]));
+  emit_insn (gen_addv2si3 (operands[0], t, h));
   DONE;
 })
 
@@ -486,7 +705,7 @@
   "pcmp<vecsize>.gt %0 = %r1, %r2"
   [(set_attr "itanium_class" "mmalua")])
 
-(define_insn "pack2_sss"
+(define_insn "vec_pack_ssat_v4hi"
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_concat:V8QI
 	  (ss_truncate:V4QI
@@ -494,10 +713,16 @@
 	  (ss_truncate:V4QI
 	    (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))))]
   ""
-  "pack2.sss %0 = %r1, %r2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,pack2.sss %0 = %r2, %r1";
+  else
+    return "%,pack2.sss %0 = %r1, %r2";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "*pack2_uss"
+(define_insn "vec_pack_usat_v4hi"
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_concat:V8QI
 	  (us_truncate:V4QI
@@ -505,10 +730,16 @@
 	  (us_truncate:V4QI
 	    (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))))]
   ""
-  "pack2.uss %0 = %r1, %r2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,pack2.uss %0 = %r2, %r1";
+  else
+    return "%,pack2.uss %0 = %r1, %r2";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "pack4_sss"
+(define_insn "vec_pack_ssat_v2si"
   [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
 	(vec_concat:V4HI
 	  (ss_truncate:V2HI
@@ -516,93 +747,103 @@
 	  (ss_truncate:V2HI
 	    (match_operand:V2SI 2 "gr_reg_or_0_operand" "rU"))))]
   ""
-  "pack4.sss %0 = %r1, %r2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,pack4.sss %0 = %r2, %r1";
+  else
+    return "%,pack4.sss %0 = %r1, %r2";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "unpack1_l"
+(define_insn "vec_interleave_lowv8qi"
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (vec_concat:V16QI
 	    (match_operand:V8QI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V8QI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 0)
-		     (const_int 1)
-		     (const_int 2)
-		     (const_int 3)
-		     (const_int 8)
-		     (const_int 9)
-		     (const_int 10)
-		     (const_int 11)])))]
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 1) (const_int 9)
+		     (const_int 2) (const_int 10)
+		     (const_int 3) (const_int 11)])))]
   ""
-  "unpack1.l %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack1.l %0 = %r1, %r2";
+  else
+    return "%,unpack1.l %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "unpack1_h"
+(define_insn "vec_interleave_highv8qi"
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (vec_concat:V16QI
 	    (match_operand:V8QI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V8QI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 4)
-		     (const_int 5)
-		     (const_int 6)
-		     (const_int 7)
-		     (const_int 12)
-		     (const_int 13)
-		     (const_int 14)
-		     (const_int 15)])))]
+	  (parallel [(const_int 4) (const_int 12)
+		     (const_int 5) (const_int 13)
+		     (const_int 6) (const_int 14)
+		     (const_int 7) (const_int 15)])))]
   ""
-  "unpack1.h %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack1.h %0 = %r1, %r2";
+  else
+    return "%,unpack1.h %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "mix1_r"
+(define_insn "mix1_even"
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (vec_concat:V16QI
 	    (match_operand:V8QI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V8QI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 0)
-		     (const_int 8)
-		     (const_int 2)
-		     (const_int 10)
-		     (const_int 4)
-		     (const_int 12)
-		     (const_int 6)
-		     (const_int 14)])))]
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 2) (const_int 10)
+		     (const_int 4) (const_int 12)
+		     (const_int 6) (const_int 14)])))]
   ""
-  "mix1.r %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,mix1.l %0 = %r1, %r2";
+  else
+    return "%,mix1.r %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "mix1_l"
+(define_insn "mix1_odd"
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (vec_concat:V16QI
 	    (match_operand:V8QI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V8QI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 1)
-		     (const_int 9)
-		     (const_int 3)
-		     (const_int 11)
-		     (const_int 5)
-		     (const_int 13)
-		     (const_int 7)
-		     (const_int 15)])))]
+	  (parallel [(const_int 1) (const_int 9)
+		     (const_int 3) (const_int 11)
+		     (const_int 5) (const_int 13)
+		     (const_int 7) (const_int 15)])))]
   ""
-  "mix1.l %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,mix1.r %0 = %r1, %r2";
+  else
+    return "%,mix1.l %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 (define_insn "*mux1_rev"
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (match_operand:V8QI 1 "gr_register_operand" "r")
-	  (parallel [(const_int 7)
-		     (const_int 6)
-		     (const_int 5)
-		     (const_int 4)
-		     (const_int 3)
-		     (const_int 2)
-		     (const_int 1)
-		     (const_int 0)])))]
+	  (parallel [(const_int 7) (const_int 6)
+		     (const_int 5) (const_int 4)
+		     (const_int 3) (const_int 2)
+		     (const_int 1) (const_int 0)])))]
   ""
   "mux1 %0 = %1, @rev"
   [(set_attr "itanium_class" "mmshf")])
@@ -611,14 +852,10 @@
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (match_operand:V8QI 1 "gr_register_operand" "r")
-	  (parallel [(const_int 0)
-		     (const_int 4)
-		     (const_int 2)
-		     (const_int 6)
-		     (const_int 1)
-		     (const_int 5)
-		     (const_int 3)
-		     (const_int 7)])))]
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 6)
+		     (const_int 1) (const_int 5)
+		     (const_int 3) (const_int 7)])))]
   ""
   "mux1 %0 = %1, @mix"
   [(set_attr "itanium_class" "mmshf")])
@@ -627,30 +864,22 @@
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (match_operand:V8QI 1 "gr_register_operand" "r")
-	  (parallel [(const_int 0)
-		     (const_int 4)
-		     (const_int 1)
-		     (const_int 5)
-		     (const_int 2)
-		     (const_int 6)
-		     (const_int 3)
-		     (const_int 7)])))]
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 1) (const_int 5)
+		     (const_int 2) (const_int 6)
+		     (const_int 3) (const_int 7)])))]
   ""
   "mux1 %0 = %1, @shuf"
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "*mux1_alt"
+(define_insn "mux1_alt"
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (match_operand:V8QI 1 "gr_register_operand" "r")
-	  (parallel [(const_int 0)
-		     (const_int 2)
-		     (const_int 4)
-		     (const_int 6)
-		     (const_int 1)
-		     (const_int 3)
-		     (const_int 5)
-		     (const_int 7)])))]
+	  (parallel [(const_int 0) (const_int 2)
+		     (const_int 4) (const_int 6)
+		     (const_int 1) (const_int 3)
+		     (const_int 5) (const_int 7)])))]
   ""
   "mux1 %0 = %1, @alt"
   [(set_attr "itanium_class" "mmshf")])
@@ -659,14 +888,14 @@
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (match_operand:V8QI 1 "gr_register_operand" "r")
-	  (parallel [(const_int 0)
-		     (const_int 0)
-		     (const_int 0)
-		     (const_int 0)
-		     (const_int 0)
-		     (const_int 0)
-		     (const_int 0)
-		     (const_int 0)])))]
+	  (parallel [(match_operand 2 "mux1_brcst_element" "")
+		     (match_dup 2)
+		     (match_dup 2)
+		     (match_dup 2)
+		     (match_dup 2)
+		     (match_dup 2)
+		     (match_dup 2)
+		     (match_dup 2)])))]
   ""
   "mux1 %0 = %1, @brcst"
   [(set_attr "itanium_class" "mmshf")])
@@ -679,60 +908,100 @@
   "mux1 %0 = %1, @brcst"
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "unpack2_l"
-  [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
-	(vec_select:V4HI
-	  (vec_concat:V8HI
-	    (match_operand:V4HI 1 "gr_reg_or_0_operand" "rU")
-	    (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 0)
-		     (const_int 4)
-		     (const_int 1)
-		     (const_int 5)])))]
+(define_expand "vec_extract_evenv8qi"
+  [(match_operand:V8QI 0 "gr_register_operand" "")
+   (match_operand:V8QI 1 "gr_register_operand" "")
+   (match_operand:V8QI 2 "gr_register_operand" "")]
   ""
-  "unpack2.l %0 = %r2, %r1"
-  [(set_attr "itanium_class" "mmshf")])
+{
+  rtx temp = gen_reg_rtx (V8QImode);
+  emit_insn (gen_mix1_even (temp, operands[1], operands[2]));
+  emit_insn (gen_mux1_alt (operands[0], temp));
+  DONE;
+})
 
-(define_insn "unpack2_h"
+(define_expand "vec_extract_oddv8qi"
+  [(match_operand:V8QI 0 "gr_register_operand" "")
+   (match_operand:V8QI 1 "gr_register_operand" "")
+   (match_operand:V8QI 2 "gr_register_operand" "")]
+  ""
+{
+  rtx temp = gen_reg_rtx (V8QImode);
+  emit_insn (gen_mix1_odd (temp, operands[1], operands[2]));
+  emit_insn (gen_mux1_alt (operands[0], temp));
+  DONE;
+})
+
+(define_insn "vec_interleave_lowv4hi"
   [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
 	(vec_select:V4HI
 	  (vec_concat:V8HI
 	    (match_operand:V4HI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 2)
-		     (const_int 6)
-		     (const_int 3)
-		     (const_int 7)])))]
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 1) (const_int 5)])))]
   ""
-  "unpack2.h %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack2.l %0 = %r1, %r2";
+  else
+    return "%,unpack2.l %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "*mix2_r"
+(define_insn "vec_interleave_highv4hi"
   [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
 	(vec_select:V4HI
 	  (vec_concat:V8HI
 	    (match_operand:V4HI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 0)
-		     (const_int 4)
-		     (const_int 2)
-		     (const_int 6)])))]
+	  (parallel [(const_int 2) (const_int 6)
+		     (const_int 3) (const_int 7)])))]
   ""
-  "mix2.r %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack2.h %0 = %r1, %r2";
+  else
+    return "%,unpack2.h %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "*mix2_l"
+(define_insn "mix2_even"
   [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
 	(vec_select:V4HI
 	  (vec_concat:V8HI
 	    (match_operand:V4HI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 1)
-		     (const_int 5)
-		     (const_int 3)
-		     (const_int 7)])))]
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 6)])))]
   ""
-  "mix2.l %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,mix2.l %0 = %r1, %r2";
+  else
+    return "%,mix2.r %0 = %r2, %r1";
+}
+  [(set_attr "itanium_class" "mmshf")])
+
+(define_insn "mix2_odd"
+  [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
+	(vec_select:V4HI
+	  (vec_concat:V8HI
+	    (match_operand:V4HI 1 "gr_reg_or_0_operand" "rU")
+	    (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))
+	  (parallel [(const_int 1) (const_int 5)
+		     (const_int 3) (const_int 7)])))]
+  ""
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,mix2.r %0 = %r1, %r2";
+  else
+    return "%,mix2.l %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 (define_insn "*mux2"
@@ -745,16 +1014,58 @@
 		     (match_operand 5 "const_int_2bit_operand" "")])))]
   ""
 {
-  int mask;
-  mask  = INTVAL (operands[2]);
-  mask |= INTVAL (operands[3]) << 2;
-  mask |= INTVAL (operands[4]) << 4;
-  mask |= INTVAL (operands[5]) << 6;
+  int mask = 0;
+  if (TARGET_BIG_ENDIAN)
+    {
+      mask |= (3 - INTVAL (operands[2])) << 6;
+      mask |= (3 - INTVAL (operands[3])) << 4;
+      mask |= (3 - INTVAL (operands[4])) << 2;
+      mask |= 3 - INTVAL (operands[5]);
+    }
+  else
+    {
+      mask |= INTVAL (operands[2]);
+      mask |= INTVAL (operands[3]) << 2;
+      mask |= INTVAL (operands[4]) << 4;
+      mask |= INTVAL (operands[5]) << 6;
+    }
   operands[2] = GEN_INT (mask);
   return "%,mux2 %0 = %1, %2";
 }
   [(set_attr "itanium_class" "mmshf")])
 
+(define_expand "vec_extract_evenodd_helper"
+  [(set (match_operand:V4HI 0 "gr_register_operand" "")
+	(vec_select:V4HI
+	  (match_operand:V4HI 1 "gr_register_operand" "")
+	  (parallel [(const_int 0) (const_int 2)
+		     (const_int 1) (const_int 3)])))]
+  "")
+
+(define_expand "vec_extract_evenv4hi"
+  [(match_operand:V4HI 0 "gr_register_operand")
+   (match_operand:V4HI 1 "gr_reg_or_0_operand")
+   (match_operand:V4HI 2 "gr_reg_or_0_operand")]
+  ""
+{
+  rtx temp = gen_reg_rtx (V4HImode);
+  emit_insn (gen_mix2_even (temp, operands[1], operands[2]));
+  emit_insn (gen_vec_extract_evenodd_helper (operands[0], temp));
+  DONE;
+})
+
+(define_expand "vec_extract_oddv4hi"
+  [(match_operand:V4HI 0 "gr_register_operand")
+   (match_operand:V4HI 1 "gr_reg_or_0_operand")
+   (match_operand:V4HI 2 "gr_reg_or_0_operand")]
+  ""
+{
+  rtx temp = gen_reg_rtx (V4HImode);
+  emit_insn (gen_mix2_odd (temp, operands[1], operands[2]));
+  emit_insn (gen_vec_extract_evenodd_helper (operands[0], temp));
+  DONE;
+})
+
 (define_insn "*mux2_brcst_hi"
   [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
 	(vec_duplicate:V4HI
@@ -763,31 +1074,69 @@
   "mux2 %0 = %1, 0"
   [(set_attr "itanium_class" "mmshf")])
 
-;; Note that mix4.r performs the exact same operation.
-(define_insn "*unpack4_l"
+(define_insn "vec_interleave_lowv2si"
+  [(set (match_operand:V2SI 0 "gr_register_operand" "=r")
+	(vec_select:V2SI
+	  (vec_concat:V4SI
+	    (match_operand:V2SI 1 "gr_reg_or_0_operand" "rU")
+	    (match_operand:V2SI 2 "gr_reg_or_0_operand" "rU"))
+	  (parallel [(const_int 0) (const_int 2)])))]
+  ""
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack4.l %0 = %r1, %r2";
+  else
+    return "%,unpack4.l %0 = %r2, %r1";
+}
+  [(set_attr "itanium_class" "mmshf")])
+
+(define_insn "vec_interleave_highv2si"
   [(set (match_operand:V2SI 0 "gr_register_operand" "=r")
 	(vec_select:V2SI
 	  (vec_concat:V4SI
 	    (match_operand:V2SI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V2SI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 0)
-		     (const_int 2)])))]
+	  (parallel [(const_int 1) (const_int 3)])))]
   ""
-  "unpack4.l %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack4.h %0 = %r1, %r2";
+  else
+    return "%,unpack4.h %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-;; Note that mix4.l performs the exact same operation.
-(define_insn "*unpack4_h"
-  [(set (match_operand:V2SI 0 "gr_register_operand" "=r")
-	(vec_select:V2SI
-	  (vec_concat:V4SI
-	    (match_operand:V2SI 1 "gr_reg_or_0_operand" "rU")
-	    (match_operand:V2SI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 1)
-		     (const_int 3)])))]
+(define_expand "vec_extract_evenv2si"
+  [(match_operand:V2SI 0 "gr_register_operand" "")
+   (match_operand:V2SI 1 "gr_register_operand" "")
+   (match_operand:V2SI 2 "gr_register_operand" "")]
   ""
-  "unpack4.h %0 = %r2, %r1"
-  [(set_attr "itanium_class" "mmshf")])
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_interleave_highv2si (operands[0], operands[1],
+					    operands[2]));
+  else
+    emit_insn (gen_vec_interleave_lowv2si (operands[0], operands[1],
+					   operands[2]));
+  DONE;
+})
+
+(define_expand "vec_extract_oddv2si"
+  [(match_operand:V2SI 0 "gr_register_operand" "")
+   (match_operand:V2SI 1 "gr_register_operand" "")
+   (match_operand:V2SI 2 "gr_register_operand" "")]
+  ""
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_interleave_lowv2si (operands[0], operands[1],
+					   operands[2]));
+  else
+    emit_insn (gen_vec_interleave_highv2si (operands[0], operands[1],
+					    operands[2]));
+  DONE;
+})
 
 (define_expand "vec_initv2si"
   [(match_operand:V2SI 0 "gr_register_operand" "")
@@ -810,10 +1159,7 @@
   if (!gr_reg_or_0_operand (op2, SImode))
     op2 = force_reg (SImode, op2);
 
-  if (TARGET_BIG_ENDIAN)
-    x = gen_rtx_VEC_CONCAT (V2SImode, op2, op1);
-  else
-    x = gen_rtx_VEC_CONCAT (V2SImode, op1, op2);
+  x = gen_rtx_VEC_CONCAT (V2SImode, op1, op2);
   emit_insn (gen_rtx_SET (VOIDmode, operands[0], x));
   DONE;
 })
@@ -824,14 +1170,19 @@
 	  (match_operand:SI 1 "gr_reg_or_0_operand" "rO")
 	  (match_operand:SI 2 "gr_reg_or_0_operand" "rO")))]
   ""
-  "unpack4.l %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack4.l %0 = %r1, %r2";
+  else
+    return "%,unpack4.l %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 ;; Missing operations
 ;; padd.uus
 ;; pavg
 ;; pavgsub
-;; pmpyshr, general form
 ;; psad
 ;; pshladd
 ;; pshradd
@@ -903,106 +1254,29 @@
   "fpnegabs %0 = %1"
   [(set_attr "itanium_class" "fmisc")])
 
-;; In order to convince combine to merge plus and mult to a useful fpma,
-;; we need a couple of extra patterns.
 (define_expand "addv2sf3"
-  [(parallel
-    [(set (match_operand:V2SF 0 "fr_register_operand" "")
-	  (plus:V2SF (match_operand:V2SF 1 "fr_register_operand" "")
-		     (match_operand:V2SF 2 "fr_register_operand" "")))
-     (use (match_dup 3))])]
+  [(set (match_operand:V2SF 0 "fr_register_operand" "")
+	(fma:V2SF (match_operand:V2SF 1 "fr_register_operand" "")
+		  (match_dup 3)
+		  (match_operand:V2SF 2 "fr_register_operand" "")))]
   ""
 {
   rtvec v = gen_rtvec (2, CONST1_RTX (SFmode), CONST1_RTX (SFmode));
   operands[3] = force_reg (V2SFmode, gen_rtx_CONST_VECTOR (V2SFmode, v));
-  if (!TARGET_FUSED_MADD)
-    {
-      emit_insn (gen_fpma (operands[0], operands[1], operands[3], operands[2]));
-      DONE;
-    }
 })
 
-;; The split condition here could be combine_completed, if we had such.
-(define_insn_and_split "*addv2sf3_1"
-  [(set (match_operand:V2SF 0 "fr_register_operand" "=f")
-	(plus:V2SF (match_operand:V2SF 1 "fr_register_operand" "f")
-		   (match_operand:V2SF 2 "fr_register_operand" "f")))
-   (use (match_operand:V2SF 3 "fr_register_operand" "f"))]
-  ""
-  "#"
-  "reload_completed"
-  [(set (match_dup 0)
-	(plus:V2SF
-	  (mult:V2SF (match_dup 1) (match_dup 3))
-	  (match_dup 2)))]
-  "")
-
-(define_insn_and_split "*addv2sf3_2"
-  [(set (match_operand:V2SF 0 "fr_register_operand" "=f")
-	(plus:V2SF
-	  (mult:V2SF (match_operand:V2SF 1 "fr_register_operand" "f")
-		     (match_operand:V2SF 2 "fr_register_operand" "f"))
-	  (match_operand:V2SF 3 "fr_register_operand" "f")))
-    (use (match_operand:V2SF 4 "" "X"))]
-  ""
-  "#"
-  ""
-  [(set (match_dup 0)
-	(plus:V2SF
-	  (mult:V2SF (match_dup 1) (match_dup 2))
-	  (match_dup 3)))]
-  "")
-
-;; In order to convince combine to merge minus and mult to a useful fpms,
-;; we need a couple of extra patterns.
 (define_expand "subv2sf3"
-  [(parallel
-    [(set (match_operand:V2SF 0 "fr_register_operand" "")
-	  (minus:V2SF (match_operand:V2SF 1 "fr_register_operand" "")
-		      (match_operand:V2SF 2 "fr_register_operand" "")))
-     (use (match_dup 3))])]
+  [(set (match_operand:V2SF 0 "fr_register_operand" "")
+	(fma:V2SF
+	  (match_operand:V2SF 1 "fr_register_operand" "")
+	  (match_dup 3)
+	  (neg:V2SF (match_operand:V2SF 2 "fr_register_operand" ""))))]
   ""
 {
   rtvec v = gen_rtvec (2, CONST1_RTX (SFmode), CONST1_RTX (SFmode));
   operands[3] = force_reg (V2SFmode, gen_rtx_CONST_VECTOR (V2SFmode, v));
-  if (!TARGET_FUSED_MADD)
-    {
-      emit_insn (gen_fpms (operands[0], operands[1], operands[3], operands[2]));
-      DONE;
-    }
 })
 
-;; The split condition here could be combine_completed, if we had such.
-(define_insn_and_split "*subv2sf3_1"
-  [(set (match_operand:V2SF 0 "fr_register_operand" "=f")
-	(minus:V2SF (match_operand:V2SF 1 "fr_register_operand" "f")
-		    (match_operand:V2SF 2 "fr_register_operand" "f")))
-   (use (match_operand:V2SF 3 "fr_register_operand" "f"))]
-  ""
-  "#"
-  "reload_completed"
-  [(set (match_dup 0)
-	(minus:V2SF
-	  (mult:V2SF (match_dup 1) (match_dup 3))
-	  (match_dup 2)))]
-  "")
-
-(define_insn_and_split "*subv2sf3_2"
-  [(set (match_operand:V2SF 0 "fr_register_operand" "=f")
-	(minus:V2SF
-	  (mult:V2SF (match_operand:V2SF 1 "fr_register_operand" "f")
-		     (match_operand:V2SF 2 "fr_register_operand" "f"))
-	  (match_operand:V2SF 3 "fr_register_operand" "f")))
-    (use (match_operand:V2SF 4 "" "X"))]
-  ""
-  "#"
-  ""
-  [(set (match_dup 0)
-	(minus:V2SF
-	  (mult:V2SF (match_dup 1) (match_dup 2))
-	  (match_dup 3)))]
-  "")
-
 (define_insn "mulv2sf3"
   [(set (match_operand:V2SF 0 "fr_register_operand" "=f")
 	(mult:V2SF (match_operand:V2SF 1 "fr_register_operand" "f")
@@ -1011,22 +1285,22 @@
   "fpmpy %0 = %1, %2"
   [(set_attr "itanium_class" "fmac")])
 
-(define_insn "fpma"
+(define_insn "fmav2sf4"
   [(set (match_operand:V2SF 0 "fr_register_operand" "=f")
-	(plus:V2SF
-	  (mult:V2SF (match_operand:V2SF 1 "fr_register_operand" "f")
-		     (match_operand:V2SF 2 "fr_register_operand" "f"))
+	(fma:V2SF
+	  (match_operand:V2SF 1 "fr_register_operand" "f")
+	  (match_operand:V2SF 2 "fr_register_operand" "f")
 	  (match_operand:V2SF 3 "fr_register_operand" "f")))]
   ""
   "fpma %0 = %1, %2, %3"
   [(set_attr "itanium_class" "fmac")])
 
-(define_insn "fpms"
+(define_insn "fmsv2sf4"
   [(set (match_operand:V2SF 0 "fr_register_operand" "=f")
-	(minus:V2SF
-	  (mult:V2SF (match_operand:V2SF 1 "fr_register_operand" "f")
-		     (match_operand:V2SF 2 "fr_register_operand" "f"))
-	  (match_operand:V2SF 3 "fr_register_operand" "f")))]
+	(fma:V2SF
+	  (match_operand:V2SF 1 "fr_register_operand" "f")
+	  (match_operand:V2SF 2 "fr_register_operand" "f")
+	  (neg:V2SF (match_operand:V2SF 3 "fr_register_operand" "f"))))]
   ""
   "fpms %0 = %1, %2, %3"
   [(set_attr "itanium_class" "fmac")])
@@ -1040,12 +1314,11 @@
   "fpnmpy %0 = %1, %2"
   [(set_attr "itanium_class" "fmac")])
 
-(define_insn "*fpnma"
+(define_insn "fnmav2sf4"
   [(set (match_operand:V2SF 0 "fr_register_operand" "=f")
-	(plus:V2SF
-	  (neg:V2SF
-	    (mult:V2SF (match_operand:V2SF 1 "fr_register_operand" "f")
-		       (match_operand:V2SF 2 "fr_register_operand" "f")))
+	(fma:V2SF
+	  (neg:V2SF (match_operand:V2SF 1 "fr_register_operand" "f"))
+	  (match_operand:V2SF 2 "fr_register_operand" "f")
 	  (match_operand:V2SF 3 "fr_register_operand" "f")))]
   ""
   "fpnma %0 = %1, %2, %3"
@@ -1073,7 +1346,10 @@
   ""
 {
   rtx tmp = gen_reg_rtx (V2SFmode);
-  emit_insn (gen_fswap (tmp, operands[1], CONST0_RTX (V2SFmode)));
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_fswap (tmp, CONST0_RTX (V2SFmode), operands[1]));
+  else
+    emit_insn (gen_fswap (tmp, operands[1], CONST0_RTX (V2SFmode)));
   emit_insn (gen_addv2sf3 (operands[0], operands[1], tmp));
   DONE;
 })
@@ -1084,7 +1360,10 @@
   ""
 {
   rtx tmp = gen_reg_rtx (V2SFmode);
-  emit_insn (gen_fswap (tmp, operands[1], CONST0_RTX (V2SFmode)));
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_fswap (tmp, CONST0_RTX (V2SFmode), operands[1]));
+  else
+    emit_insn (gen_fswap (tmp, operands[1], CONST0_RTX (V2SFmode)));
   emit_insn (gen_smaxv2sf3 (operands[0], operands[1], tmp));
   DONE;
 })
@@ -1095,7 +1374,10 @@
   ""
 {
   rtx tmp = gen_reg_rtx (V2SFmode);
-  emit_insn (gen_fswap (tmp, operands[1], CONST0_RTX (V2SFmode)));
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_fswap (tmp, CONST0_RTX (V2SFmode), operands[1]));
+  else
+    emit_insn (gen_fswap (tmp, operands[1], CONST0_RTX (V2SFmode)));
   emit_insn (gen_sminv2sf3 (operands[0], operands[1], tmp));
   DONE;
 })
@@ -1161,10 +1443,7 @@
   if (!fr_reg_or_fp01_operand (op2, SFmode))
     op2 = force_reg (SFmode, op2);
 
-  if (TARGET_BIG_ENDIAN)
-    emit_insn (gen_fpack (operands[0], op2, op1));
-  else
-    emit_insn (gen_fpack (operands[0], op1, op2));
+  emit_insn (gen_fpack (operands[0], op1, op2));
   DONE;
 })
 
@@ -1174,7 +1453,13 @@
 	  (match_operand:SF 1 "fr_reg_or_fp01_operand" "fG")
 	  (match_operand:SF 2 "fr_reg_or_fp01_operand" "fG")))]
   ""
-  "fpack %0 = %F2, %F1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,fpack %0 = %F1, %F2";
+  else
+    return "%,fpack %0 = %F2, %F1";
+}
   [(set_attr "itanium_class" "fmisc")])
 
 (define_insn "fswap"
@@ -1185,10 +1470,16 @@
 	    (match_operand:V2SF 2 "fr_reg_or_0_operand" "fU"))
 	  (parallel [(const_int 1) (const_int 2)])))]
   ""
-  "fswap %0 = %F1, %F2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,fswap %0 = %F2, %F1";
+  else
+    return "%,fswap %0 = %F1, %F2";
+}
   [(set_attr "itanium_class" "fmisc")])
 
-(define_insn "*fmix_l"
+(define_insn "vec_interleave_highv2sf"
   [(set (match_operand:V2SF 0 "fr_register_operand" "=f")
 	(vec_select:V2SF
 	  (vec_concat:V4SF
@@ -1196,10 +1487,16 @@
 	    (match_operand:V2SF 2 "fr_reg_or_0_operand" "fU"))
 	  (parallel [(const_int 1) (const_int 3)])))]
   ""
-  "fmix.l %0 = %F2, %F1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,fmix.l %0 = %F1, %F2";
+  else
+    return "%,fmix.l %0 = %F2, %F1";
+}
   [(set_attr "itanium_class" "fmisc")])
 
-(define_insn "fmix_r"
+(define_insn "vec_interleave_lowv2sf"
   [(set (match_operand:V2SF 0 "fr_register_operand" "=f")
 	(vec_select:V2SF
 	  (vec_concat:V4SF
@@ -1207,7 +1504,13 @@
 	    (match_operand:V2SF 2 "fr_reg_or_0_operand" "fU"))
 	  (parallel [(const_int 0) (const_int 2)])))]
   ""
-  "fmix.r %0 = %F2, %F1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,fmix.r %0 = %F1, %F2";
+  else
+    return "%,fmix.r %0 = %F2, %F1";
+}
   [(set_attr "itanium_class" "fmisc")])
 
 (define_insn "fmix_lr"
@@ -1218,25 +1521,63 @@
 	    (match_operand:V2SF 2 "fr_reg_or_0_operand" "fU"))
 	  (parallel [(const_int 0) (const_int 3)])))]
   ""
-  "fmix.lr %0 = %F2, %F1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,fmix.lr %0 = %F1, %F2";
+  else
+    return "%,fmix.lr %0 = %F2, %F1";
+}
   [(set_attr "itanium_class" "fmisc")])
 
+(define_expand "vec_extract_evenv2sf"
+  [(match_operand:V2SF 0 "gr_register_operand" "")
+   (match_operand:V2SF 1 "gr_register_operand" "")
+   (match_operand:V2SF 2 "gr_register_operand" "")]
+  ""
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_interleave_highv2sf (operands[0], operands[1],
+					    operands[2]));
+  else
+    emit_insn (gen_vec_interleave_lowv2sf (operands[0], operands[1],
+					   operands[2]));
+  DONE;
+})
+
+(define_expand "vec_extract_oddv2sf"
+  [(match_operand:V2SF 0 "gr_register_operand" "")
+   (match_operand:V2SF 1 "gr_register_operand" "")
+   (match_operand:V2SF 2 "gr_register_operand" "")]
+  ""
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_interleave_lowv2sf (operands[0], operands[1],
+					   operands[2]));
+  else
+    emit_insn (gen_vec_interleave_highv2sf (operands[0], operands[1],
+					    operands[2]));
+  DONE;
+})
+
 (define_expand "vec_setv2sf"
   [(match_operand:V2SF 0 "fr_register_operand" "")
    (match_operand:SF 1 "fr_register_operand" "")
    (match_operand 2 "const_int_operand" "")]
   ""
 {
+  rtx op0 = operands[0];
   rtx tmp = gen_reg_rtx (V2SFmode);
+
   emit_insn (gen_fpack (tmp, operands[1], CONST0_RTX (SFmode)));
 
   switch (INTVAL (operands[2]))
     {
     case 0:
-      emit_insn (gen_fmix_lr (operands[0], tmp, operands[0]));
+      emit_insn (gen_fmix_lr (op0, tmp, op0));
       break;
     case 1:
-      emit_insn (gen_fmix_r (operands[0], operands[0], tmp));
+      emit_insn (gen_vec_interleave_lowv2sf (op0, op0, tmp));
       break;
     default:
       gcc_unreachable ();
@@ -1263,8 +1604,8 @@
 })
 
 (define_insn_and_split "*vec_extractv2sf_0_be"
-  [(set (match_operand:SF 0 "register_operand" "=r,f")
-	(unspec:SF [(match_operand:V2SF 1 "register_operand" "rf,r")
+  [(set (match_operand:SF 0 "register_operand" "=rf,r")
+	(unspec:SF [(match_operand:V2SF 1 "nonimmediate_operand" "m,r")
 		    (const_int 0)]
 		   UNSPEC_VECT_EXTR))]
   "TARGET_BIG_ENDIAN"
@@ -1272,31 +1613,44 @@
   "reload_completed"
   [(set (match_dup 0) (match_dup 1))]
 {
-  if (REG_P (operands[1]) && FR_REGNO_P (REGNO (operands[1])))
-    operands[0] = gen_rtx_REG (V2SFmode, REGNO (operands[0]));
+  if (MEM_P (operands[1]))
+    operands[1] = adjust_address (operands[1], SFmode, 0);
   else
-    operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));
+    {
+      emit_insn (gen_lshrdi3 (operands[0], operands[1], GEN_INT (32)));
+      DONE;
+    }
 })
 
-(define_insn_and_split "*vec_extractv2sf_1"
+(define_insn_and_split "*vec_extractv2sf_1_le"
   [(set (match_operand:SF 0 "register_operand" "=r")
 	(unspec:SF [(match_operand:V2SF 1 "register_operand" "r")
 		    (const_int 1)]
 		   UNSPEC_VECT_EXTR))]
-  ""
+  "!TARGET_BIG_ENDIAN"
   "#"
-  "reload_completed"
+  "&& reload_completed"
   [(const_int 0)]
 {
   operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
   operands[1] = gen_rtx_REG (DImode, REGNO (operands[1]));
-  if (TARGET_BIG_ENDIAN)
-    emit_move_insn (operands[0], operands[1]);
-  else
-    emit_insn (gen_lshrdi3 (operands[0], operands[1], GEN_INT (32)));
+  emit_insn (gen_lshrdi3 (operands[0], operands[1], GEN_INT (32)));
   DONE;
 })
 
+(define_insn_and_split "*vec_extractv2sf_1_be"
+  [(set (match_operand:SF 0 "register_operand" "=rf")
+	(unspec:SF [(match_operand:V2SF 1 "register_operand" "r")
+		    (const_int 1)]
+		   UNSPEC_VECT_EXTR))]
+  "TARGET_BIG_ENDIAN"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));
+})
+
 (define_expand "vec_extractv2sf"
   [(set (match_operand:SF 0 "register_operand" "")
 	(unspec:SF [(match_operand:V2SF 1 "register_operand" "")
@@ -1305,6 +1659,72 @@
   ""
   "")
 
+(define_expand "vec_unpacku_lo_<mode>"
+  [(match_operand:<vecwider> 0 "register_operand" "")
+   (match_operand:VECINT12   1 "register_operand" "")]
+  ""
+{
+  ia64_expand_unpack (operands, true, false);
+  DONE;
+})
+
+(define_expand "vec_unpacku_hi_<mode>"
+  [(match_operand:<vecwider> 0 "register_operand" "")
+   (match_operand:VECINT12   1 "register_operand" "")]
+  ""
+{
+  ia64_expand_unpack (operands, true, true);
+  DONE;
+})
+
+(define_expand "vec_unpacks_lo_<mode>"
+  [(match_operand:<vecwider> 0 "register_operand" "")
+   (match_operand:VECINT12   1 "register_operand" "")]
+  ""
+{
+  ia64_expand_unpack (operands, false, false);
+  DONE;
+})
+
+(define_expand "vec_unpacks_hi_<mode>"
+  [(match_operand:<vecwider> 0 "register_operand" "")
+   (match_operand:VECINT12   1 "register_operand" "")]
+  ""
+{
+  ia64_expand_unpack (operands, false, true);
+  DONE;
+})
+
+(define_expand "vec_pack_trunc_v4hi"
+  [(match_operand:V8QI 0 "gr_register_operand" "")
+   (match_operand:V4HI 1 "gr_register_operand" "")
+   (match_operand:V4HI 2 "gr_register_operand" "")]
+  ""
+{
+  rtx op1 = gen_lowpart (V8QImode, operands[1]);
+  rtx op2 = gen_lowpart (V8QImode, operands[2]);
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_extract_oddv8qi (operands[0], op1, op2));
+  else
+    emit_insn (gen_vec_extract_evenv8qi (operands[0], op1, op2));
+  DONE;
+})
+
+(define_expand "vec_pack_trunc_v2si"
+  [(match_operand:V4HI 0 "gr_register_operand" "")
+   (match_operand:V2SI 1 "gr_register_operand" "")
+   (match_operand:V2SI 2 "gr_register_operand" "")]
+  ""
+{
+  rtx op1 = gen_lowpart (V4HImode, operands[1]);
+  rtx op2 = gen_lowpart (V4HImode, operands[2]);
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_extract_oddv4hi (operands[0], op1, op2));
+  else
+    emit_insn (gen_vec_extract_evenv4hi (operands[0], op1, op2));
+  DONE;
+})
+
 ;; Missing operations
 ;; fprcpa
 ;; fpsqrta