diff gcc/config/spu/spu.c @ 47:3bfb6c00c1e0

update it from 4.4.2 to 4.4.3.
author kent <kent@cr.ie.u-ryukyu.ac.jp>
date Sun, 07 Feb 2010 17:44:34 +0900
parents 58ad6c70ea60
children 77e2b8dfacca
--- a/gcc/config/spu/spu.c	Sun Feb 07 17:43:48 2010 +0900
+++ b/gcc/config/spu/spu.c	Sun Feb 07 17:44:34 2010 +0900
@@ -189,9 +189,9 @@
 static void spu_va_start (tree, rtx);
 static tree spu_gimplify_va_arg_expr (tree valist, tree type,
 				      gimple_seq * pre_p, gimple_seq * post_p);
-static int regno_aligned_for_load (int regno);
 static int store_with_one_insn_p (rtx mem);
 static int mem_is_padded_component_ref (rtx x);
+static int reg_aligned_for_addr (rtx x);
 static bool spu_assemble_integer (rtx x, unsigned int size, int aligned_p);
 static void spu_asm_globalize_label (FILE * file, const char *name);
 static unsigned char spu_rtx_costs (rtx x, int code, int outer_code,
@@ -210,6 +210,7 @@
 static int spu_sms_res_mii (struct ddg *g);
 static void asm_file_start (void);
 static unsigned int spu_section_type_flags (tree, const char *, int);
+static rtx spu_expand_load (rtx, rtx, rtx, int);
 
 extern const char *reg_names[];
 rtx spu_compare_op0, spu_compare_op1;
@@ -576,66 +577,85 @@
 void
 spu_expand_extv (rtx ops[], int unsignedp)
 {
+  rtx dst = ops[0], src = ops[1];
   HOST_WIDE_INT width = INTVAL (ops[2]);
   HOST_WIDE_INT start = INTVAL (ops[3]);
-  HOST_WIDE_INT src_size, dst_size;
-  enum machine_mode src_mode, dst_mode;
-  rtx dst = ops[0], src = ops[1];
-  rtx s;
-
-  dst = adjust_operand (ops[0], 0);
-  dst_mode = GET_MODE (dst);
-  dst_size = GET_MODE_BITSIZE (GET_MODE (dst));
-
-  src = adjust_operand (src, &start);
-  src_mode = GET_MODE (src);
-  src_size = GET_MODE_BITSIZE (GET_MODE (src));
-
-  if (start > 0)
+  HOST_WIDE_INT align_mask;
+  rtx s0, s1, mask, r0;
+
+  gcc_assert (REG_P (dst) && GET_MODE (dst) == TImode);
+
+  if (MEM_P (src))
     {
-      s = gen_reg_rtx (src_mode);
-      switch (src_mode)
+      /* First, determine if we need 1 TImode load or 2.  We need only 1
+         if the bits being extracted do not cross the alignment boundary
+         as determined by the MEM and its address. */
+
+      align_mask = -MEM_ALIGN (src);
+      if ((start & align_mask) == ((start + width - 1) & align_mask))
+	{
+	  /* Alignment is sufficient for 1 load. */
+	  s0 = gen_reg_rtx (TImode);
+	  r0 = spu_expand_load (s0, 0, src, start / 8);
+	  start &= 7;
+	  if (r0)
+	    emit_insn (gen_rotqby_ti (s0, s0, r0));
+	}
+      else
 	{
-	case SImode:
-	  emit_insn (gen_ashlsi3 (s, src, GEN_INT (start)));
-	  break;
-	case DImode:
-	  emit_insn (gen_ashldi3 (s, src, GEN_INT (start)));
-	  break;
-	case TImode:
-	  emit_insn (gen_ashlti3 (s, src, GEN_INT (start)));
-	  break;
-	default:
-	  abort ();
+	  /* Need 2 loads. */
+	  s0 = gen_reg_rtx (TImode);
+	  s1 = gen_reg_rtx (TImode);
+	  r0 = spu_expand_load (s0, s1, src, start / 8);
+	  start &= 7;
+
+	  gcc_assert (start + width <= 128);
+	  if (r0)
+	    {
+	      rtx r1 = gen_reg_rtx (SImode);
+	      mask = gen_reg_rtx (TImode);
+	      emit_move_insn (mask, GEN_INT (-1));
+	      emit_insn (gen_rotqby_ti (s0, s0, r0));
+	      emit_insn (gen_rotqby_ti (s1, s1, r0));
+	      if (GET_CODE (r0) == CONST_INT)
+		r1 = GEN_INT (INTVAL (r0) & 15);
+	      else
+		emit_insn (gen_andsi3 (r1, r0, GEN_INT (15)));
+	      emit_insn (gen_shlqby_ti (mask, mask, r1));
+	      emit_insn (gen_selb (s0, s1, s0, mask));
+	    }
 	}
-      src = s;
+
     }
-
-  if (width < src_size)
+  else if (GET_CODE (src) == SUBREG)
     {
-      rtx pat;
-      int icode;
-      switch (src_mode)
-	{
-	case SImode:
-	  icode = unsignedp ? CODE_FOR_lshrsi3 : CODE_FOR_ashrsi3;
-	  break;
-	case DImode:
-	  icode = unsignedp ? CODE_FOR_lshrdi3 : CODE_FOR_ashrdi3;
-	  break;
-	case TImode:
-	  icode = unsignedp ? CODE_FOR_lshrti3 : CODE_FOR_ashrti3;
-	  break;
-	default:
-	  abort ();
-	}
-      s = gen_reg_rtx (src_mode);
-      pat = GEN_FCN (icode) (s, src, GEN_INT (src_size - width));
-      emit_insn (pat);
-      src = s;
+      rtx r = SUBREG_REG (src);
+      gcc_assert (REG_P (r) && SCALAR_INT_MODE_P (GET_MODE (r)));
+      s0 = gen_reg_rtx (TImode);
+      if (GET_MODE_SIZE (GET_MODE (r)) < GET_MODE_SIZE (TImode))
+	emit_insn (gen_rtx_SET (VOIDmode, s0, gen_rtx_ZERO_EXTEND (TImode, r)));
+      else
+	emit_move_insn (s0, src);
+    }
+  else
+    {
+      gcc_assert (REG_P (src) && GET_MODE (src) == TImode);
+      s0 = gen_reg_rtx (TImode);
+      emit_move_insn (s0, src);
     }
 
-  convert_move (dst, src, unsignedp);
+  /* Now s0 is TImode and contains the bits to extract at start. */
+
+  if (start)
+    emit_insn (gen_rotlti3 (s0, s0, GEN_INT (start)));
+
+  if (128 - width)
+    {
+      tree c = build_int_cst (NULL_TREE, 128 - width);
+      s0 = expand_shift (RSHIFT_EXPR, TImode, s0, c, s0, unsignedp);
+    }
+
+  emit_move_insn (dst, s0);
 }
 
 void
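
As an aside on the one-load-or-two test in spu_expand_extv above, a minimal
host-side C sketch (not part of the patch; needs_two_loads is an invented
name) of how the -MEM_ALIGN mask detects a bit-field that straddles an
alignment boundary:

#include <stdio.h>

/* 'align' plays the role of MEM_ALIGN (src), in bits; 'start' and 'width'
   are the bit position and size of the field being extracted.  */
static int needs_two_loads (int align, int start, int width)
{
  int align_mask = -align;   /* e.g. align = 128 clears the low 7 bits */
  /* One load suffices when the first and last extracted bit fall in the
     same aligned block; otherwise the field crosses a boundary.  */
  return (start & align_mask) != ((start + width - 1) & align_mask);
}

int main (void)
{
  printf ("%d\n", needs_two_loads (128, 0, 32));    /* 0: one quadword     */
  printf ("%d\n", needs_two_loads (128, 112, 32));  /* 1: crosses the line */
  return 0;
}
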
@@ -728,38 +748,41 @@
     }
   if (GET_CODE (ops[0]) == MEM)
     {
-      rtx aligned = gen_reg_rtx (SImode);
       rtx low = gen_reg_rtx (SImode);
-      rtx addr = gen_reg_rtx (SImode);
       rtx rotl = gen_reg_rtx (SImode);
       rtx mask0 = gen_reg_rtx (TImode);
+      rtx addr;
+      rtx addr0;
+      rtx addr1;
       rtx mem;
 
-      emit_move_insn (addr, XEXP (ops[0], 0));
-      emit_insn (gen_andsi3 (aligned, addr, GEN_INT (-16)));
+      addr = force_reg (Pmode, XEXP (ops[0], 0));
+      addr0 = gen_rtx_AND (Pmode, addr, GEN_INT (-16));
       emit_insn (gen_andsi3 (low, addr, GEN_INT (15)));
       emit_insn (gen_negsi2 (rotl, low));
       emit_insn (gen_rotqby_ti (shift_reg, shift_reg, rotl));
       emit_insn (gen_rotqmby_ti (mask0, mask, rotl));
-      mem = change_address (ops[0], TImode, aligned);
+      mem = change_address (ops[0], TImode, addr0);
       set_mem_alias_set (mem, 0);
       emit_move_insn (dst, mem);
       emit_insn (gen_selb (dst, dst, shift_reg, mask0));
-      emit_move_insn (mem, dst);
       if (start + width > MEM_ALIGN (ops[0]))
 	{
 	  rtx shl = gen_reg_rtx (SImode);
 	  rtx mask1 = gen_reg_rtx (TImode);
 	  rtx dst1 = gen_reg_rtx (TImode);
 	  rtx mem1;
+	  addr1 = plus_constant (addr, 16);
+	  addr1 = gen_rtx_AND (Pmode, addr1, GEN_INT (-16));
 	  emit_insn (gen_subsi3 (shl, GEN_INT (16), low));
 	  emit_insn (gen_shlqby_ti (mask1, mask, shl));
-	  mem1 = adjust_address (mem, TImode, 16);
+	  mem1 = change_address (ops[0], TImode, addr1);
 	  set_mem_alias_set (mem1, 0);
 	  emit_move_insn (dst1, mem1);
 	  emit_insn (gen_selb (dst1, dst1, shift_reg, mask1));
 	  emit_move_insn (mem1, dst1);
 	}
+      emit_move_insn (mem, dst);
     }
   else
     emit_insn (gen_selb (dst, copy_rtx (dst), shift_reg, mask));
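
The insv path above implements an unaligned store as a read-modify-write of
the enclosing quadword.  A rough host-side C sketch of the same idea, with
the selb merge modelled at byte granularity (illustration only; all names
are invented, and only the single-quadword case is shown -- the expander
handles a field that spills into the next quadword with a second
load/merge/store):

#include <string.h>
#include <stdint.h>

/* Merge: take bytes from 'b' where 'mask' is 0xff, from 'a' elsewhere,
   the way selb selects bits.  */
static void selb (uint8_t *dst, const uint8_t *a, const uint8_t *b,
                  const uint8_t *mask, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = (uint8_t) ((a[i] & ~mask[i]) | (b[i] & mask[i]));
}

/* Store 'len' bytes of 'src' at offset 'off' inside the 16-byte aligned
   block 'quad' without disturbing the surrounding bytes.  */
static void store_unaligned (uint8_t quad[16], int off,
                             const uint8_t *src, int len)
{
  uint8_t data[16] = { 0 }, mask[16] = { 0 }, merged[16];
  memcpy (data + off, src, len);        /* rotate the data into place */
  memset (mask + off, 0xff, len);       /* mask selects the new bytes */
  selb (merged, quad, data, mask, 16);  /* lqd + selb                 */
  memcpy (quad, merged, 16);            /* stqd                       */
}

int main (void)
{
  uint8_t quad[16] = { 0 };
  uint8_t val[2] = { 0xab, 0xcd };
  store_unaligned (quad, 5, val, 2);    /* bytes 5..6 become ab cd */
  return quad[5] == 0xab && quad[6] == 0xcd ? 0 : 1;
}
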
@@ -1585,6 +1608,13 @@
       output_addr_const (file, GEN_INT (val));
       return;
 
+    case 'v':
+    case 'w':
+      constant_to_array (mode, x, arr);
+      val = (((arr[0] << 1) + (arr[1] >> 7)) & 0xff) - 127;
+      output_addr_const (file, GEN_INT (code == 'w' ? -val : val));
+      return;
+
     case 0:
       if (xcode == REG)
 	fprintf (file, "%s", reg_names[REGNO (x)]);
@@ -1597,7 +1627,7 @@
       return;
 
       /* unused letters
-	              o qr  uvw yz
+	              o qr  u   yz
 	AB            OPQR  UVWXYZ */
     default:
       output_operand_lossage ("invalid %%xn code");
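
For the new 'v' and 'w' output codes, a small host-side sketch (illustration
only) of the exponent reconstruction from the big-endian byte image of an
SFmode constant: bits 30..23 of the float are the low 7 bits of byte 0 plus
the top bit of byte 1, and subtracting 127 removes the bias.

#include <stdio.h>

static int unbiased_exp (const unsigned char arr[4])
{
  return (((arr[0] << 1) + (arr[1] >> 7)) & 0xff) - 127;
}

int main (void)
{
  unsigned char eight[4] = { 0x41, 0x00, 0x00, 0x00 };  /* 8.0f = 0x41000000 */
  printf ("%d\n", unbiased_exp (eight));                /* prints 3 */
  return 0;
}
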
@@ -1618,6 +1648,8 @@
   rtx pic_reg = pic_offset_table_rtx;
   if (!reload_completed && !reload_in_progress)
     abort ();
+  if (current_function_is_leaf && !df_regs_ever_live_p (LAST_ARG_REGNUM))
+    pic_reg = gen_rtx_REG (SImode, LAST_ARG_REGNUM);
   return pic_reg;
 }
 
@@ -2765,6 +2797,25 @@
 
   pad_bb ();
 
+  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    if (NONJUMP_INSN_P (insn) && INSN_CODE (insn) == CODE_FOR_hbr)
+      {
+	/* Adjust the LABEL_REF in a hint when we have inserted a nop
+	   between its branch label and the branch.  We don't move the
+	   label because GCC expects it at the beginning of the block. */
+	rtx unspec = SET_SRC (XVECEXP (PATTERN (insn), 0, 0));
+	rtx label_ref = XVECEXP (unspec, 0, 0);
+	rtx label = XEXP (label_ref, 0);
+	rtx branch;
+	int offset = 0;
+	for (branch = NEXT_INSN (label);
+	     !JUMP_P (branch) && !CALL_P (branch);
+	     branch = NEXT_INSN (branch))
+	  if (NONJUMP_INSN_P (branch))
+	    offset += get_attr_length (branch);
+	if (offset > 0)
+	  XVECEXP (unspec, 0, 0) = plus_constant (label_ref, offset);
+      }
 
   if (spu_flag_var_tracking)
     {
@@ -2972,7 +3023,7 @@
       insn = ready[i];
       if (INSN_CODE (insn) == -1
 	  || INSN_CODE (insn) == CODE_FOR_blockage
-	  || INSN_CODE (insn) == CODE_FOR__spu_convert)
+	  || (INSN_P (insn) && get_attr_length (insn) == 0))
 	{
 	  ready[i] = ready[nready - 1];
 	  ready[nready - 1] = insn;
@@ -3103,8 +3154,8 @@
       || INSN_CODE (dep_insn) == CODE_FOR_blockage)
     return 0;
 
-  if (INSN_CODE (insn) == CODE_FOR__spu_convert
-      || INSN_CODE (dep_insn) == CODE_FOR__spu_convert)
+  if ((INSN_P (insn) && get_attr_length (insn) == 0)
+      || (INSN_P (dep_insn) && get_attr_length (dep_insn) == 0))
     return 0;
 
   /* Make sure hbrps are spread out. */
@@ -3503,6 +3554,58 @@
   return val >= low && val <= high;
 }
 
+/* TRUE when op is an immediate and an exact power of 2, and given that
+   OP is 2^scale, scale >= LOW && scale <= HIGH.  When OP is a vector,
+   all entries must be the same. */
+bool
+exp2_immediate_p (rtx op, enum machine_mode mode, int low, int high)
+{
+  enum machine_mode int_mode;
+  HOST_WIDE_INT val;
+  unsigned char arr[16];
+  int bytes, i, j;
+
+  gcc_assert (GET_CODE (op) == CONST_INT || GET_CODE (op) == CONST_DOUBLE
+	      || GET_CODE (op) == CONST_VECTOR);
+
+  if (GET_CODE (op) == CONST_VECTOR
+      && !const_vector_immediate_p (op))
+    return 0;
+
+  if (GET_MODE (op) != VOIDmode)
+    mode = GET_MODE (op);
+
+  constant_to_array (mode, op, arr);
+
+  if (VECTOR_MODE_P (mode))
+    mode = GET_MODE_INNER (mode);
+
+  bytes = GET_MODE_SIZE (mode);
+  int_mode = mode_for_size (GET_MODE_BITSIZE (mode), MODE_INT, 0);
+
+  /* Check that bytes are repeated. */
+  for (i = bytes; i < 16; i += bytes)
+    for (j = 0; j < bytes; j++)
+      if (arr[j] != arr[i + j])
+	return 0;
+
+  val = arr[0];
+  for (j = 1; j < bytes; j++)
+    val = (val << 8) | arr[j];
+
+  val = trunc_int_for_mode (val, int_mode);
+
+  /* Currently, we only handle SFmode */
+  gcc_assert (mode == SFmode);
+  if (mode == SFmode)
+    {
+      int exp = (val >> 23) - 127;
+      return val > 0 && (val & 0x007fffff) == 0
+	     &&  exp >= low && exp <= high;
+    }
+  return FALSE;
+}
+
 /* We accept:
    - any 32-bit constant (SImode, SFmode)
    - any constant that can be generated with fsmbi (any mode)
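
A host-side sketch (illustration only) of the SFmode test exp2_immediate_p
reduces to once the constant's bit pattern has been assembled: the value is
an exact power of two exactly when the mantissa bits are all zero, and the
unbiased exponent must fall in [low, high].

#include <stdio.h>
#include <stdint.h>

static int sf_exp2_in_range (int32_t bits, int low, int high)
{
  int exp = (int) (((uint32_t) bits >> 23) & 0xff) - 127;
  /* Positive, mantissa all zero (exact 2^exp), exponent within range.  */
  return bits > 0 && (bits & 0x007fffff) == 0 && exp >= low && exp <= high;
}

int main (void)
{
  printf ("%d\n", sf_exp2_in_range (0x40000000, -126, 127));  /* 2.0f -> 1 */
  printf ("%d\n", sf_exp2_in_range (0x40400000, -126, 127));  /* 3.0f -> 0 */
  return 0;
}
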
@@ -3533,44 +3636,36 @@
 /* Valid address are:
    - symbol_ref, label_ref, const
    - reg
-   - reg + const, where either reg or const is 16 byte aligned
+   - reg + const_int, where const_int is 16 byte aligned
    - reg + reg, alignment doesn't matter
   The alignment matters in the reg+const case because lqd and stqd
-  ignore the 4 least significant bits of the const.  (TODO: It might be
-  preferable to allow any alignment and fix it up when splitting.) */
+  ignore the 4 least significant bits of the const.  We only care about
+  16 byte modes because the expand phase will change all smaller MEM
+  references to TImode.  */
 int
 spu_legitimate_address (enum machine_mode mode ATTRIBUTE_UNUSED,
 			rtx x, int reg_ok_strict)
 {
-  if (mode == TImode && GET_CODE (x) == AND
+  int aligned = GET_MODE_SIZE (mode) >= 16;
+  if (aligned
+      && GET_CODE (x) == AND
       && GET_CODE (XEXP (x, 1)) == CONST_INT
-      && INTVAL (XEXP (x, 1)) == (HOST_WIDE_INT) -16)
+      && INTVAL (XEXP (x, 1)) == (HOST_WIDE_INT) - 16)
     x = XEXP (x, 0);
   switch (GET_CODE (x))
     {
+    case LABEL_REF:
     case SYMBOL_REF:
-    case LABEL_REF:
-      return !TARGET_LARGE_MEM;
-
     case CONST:
-      if (!TARGET_LARGE_MEM && GET_CODE (XEXP (x, 0)) == PLUS)
-	{
-	  rtx sym = XEXP (XEXP (x, 0), 0);
-	  rtx cst = XEXP (XEXP (x, 0), 1);
-
-	  /* Accept any symbol_ref + constant, assuming it does not
-	     wrap around the local store addressability limit.  */
-	  if (GET_CODE (sym) == SYMBOL_REF && GET_CODE (cst) == CONST_INT)
-	    return 1;
-	}
-      return 0;
+      return !TARGET_LARGE_MEM;
 
     case CONST_INT:
       return INTVAL (x) >= 0 && INTVAL (x) <= 0x3ffff;
 
     case SUBREG:
       x = XEXP (x, 0);
-      gcc_assert (GET_CODE (x) == REG);
+      if (REG_P (x))
+	return 0;
 
     case REG:
       return INT_REG_OK_FOR_BASE_P (x, reg_ok_strict);
@@ -3584,29 +3679,25 @@
 	  op0 = XEXP (op0, 0);
 	if (GET_CODE (op1) == SUBREG)
 	  op1 = XEXP (op1, 0);
-	/* We can't just accept any aligned register because CSE can
-	   change it to a register that is not marked aligned and then
-	   recog will fail.   So we only accept frame registers because
-	   they will only be changed to other frame registers. */
 	if (GET_CODE (op0) == REG
 	    && INT_REG_OK_FOR_BASE_P (op0, reg_ok_strict)
 	    && GET_CODE (op1) == CONST_INT
 	    && INTVAL (op1) >= -0x2000
 	    && INTVAL (op1) <= 0x1fff
-	    && (regno_aligned_for_load (REGNO (op0)) || (INTVAL (op1) & 15) == 0))
-	  return 1;
+	    && (!aligned || (INTVAL (op1) & 15) == 0))
+	  return TRUE;
 	if (GET_CODE (op0) == REG
 	    && INT_REG_OK_FOR_BASE_P (op0, reg_ok_strict)
 	    && GET_CODE (op1) == REG
 	    && INT_REG_OK_FOR_INDEX_P (op1, reg_ok_strict))
-	  return 1;
+	  return TRUE;
       }
       break;
 
     default:
       break;
     }
-  return 0;
+  return FALSE;
 }
 
 /* When the address is reg + const_int, force the const_int into a
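
A host-side sketch (illustration only; dform_offset_ok is an invented name)
of the reg + const_int rule enforced in spu_legitimate_address above: lqd
and stqd ignore the 4 least significant bits of their immediate, so for
16-byte modes only offsets that are multiples of 16 within the d-form range
-0x2000..0x1fff are accepted.

#include <stdio.h>

static int dform_offset_ok (long offset, int need_16_byte_alignment)
{
  if (offset < -0x2000 || offset > 0x1fff)
    return 0;                               /* outside the d-form range */
  return !need_16_byte_alignment || (offset & 15) == 0;
}

int main (void)
{
  printf ("%d\n", dform_offset_ok (32, 1));  /* 1: aligned and in range   */
  printf ("%d\n", dform_offset_ok (20, 1));  /* 0: low bits would be lost */
  printf ("%d\n", dform_offset_ok (20, 0));  /* 1: sub-16-byte access     */
  return 0;
}
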
@@ -4061,60 +4152,14 @@
     }
 }
 
-/* This is called to decide when we can simplify a load instruction.  We
-   must only return true for registers which we know will always be
-   aligned.  Taking into account that CSE might replace this reg with
-   another one that has not been marked aligned.  
-   So this is really only true for frame, stack and virtual registers,
-   which we know are always aligned and should not be adversely effected
-   by CSE.  */
+/* This is called any time we inspect the alignment of a register for
+   addresses.  */
 static int
-regno_aligned_for_load (int regno)
-{
-  return regno == FRAME_POINTER_REGNUM
-    || (frame_pointer_needed && regno == HARD_FRAME_POINTER_REGNUM)
-    || regno == ARG_POINTER_REGNUM
-    || regno == STACK_POINTER_REGNUM
-    || (regno >= FIRST_VIRTUAL_REGISTER 
-	&& regno <= LAST_VIRTUAL_REGISTER);
-}
-
-/* Return TRUE when mem is known to be 16-byte aligned. */
-int
-aligned_mem_p (rtx mem)
+reg_aligned_for_addr (rtx x)
 {
-  if (MEM_ALIGN (mem) >= 128)
-    return 1;
-  if (GET_MODE_SIZE (GET_MODE (mem)) >= 16)
-    return 1;
-  if (GET_CODE (XEXP (mem, 0)) == PLUS)
-    {
-      rtx p0 = XEXP (XEXP (mem, 0), 0);
-      rtx p1 = XEXP (XEXP (mem, 0), 1);
-      if (regno_aligned_for_load (REGNO (p0)))
-	{
-	  if (GET_CODE (p1) == REG && regno_aligned_for_load (REGNO (p1)))
-	    return 1;
-	  if (GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15) == 0)
-	    return 1;
-	}
-    }
-  else if (GET_CODE (XEXP (mem, 0)) == REG)
-    {
-      if (regno_aligned_for_load (REGNO (XEXP (mem, 0))))
-	return 1;
-    }
-  else if (ALIGNED_SYMBOL_REF_P (XEXP (mem, 0)))
-    return 1;
-  else if (GET_CODE (XEXP (mem, 0)) == CONST)
-    {
-      rtx p0 = XEXP (XEXP (XEXP (mem, 0), 0), 0);
-      rtx p1 = XEXP (XEXP (XEXP (mem, 0), 0), 1);
-      if (GET_CODE (p0) == SYMBOL_REF
-	  && GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15) == 0)
-	return 1;
-    }
-  return 0;
+  int regno =
+    REGNO (x) < FIRST_PSEUDO_REGISTER ? ORIGINAL_REGNO (x) : REGNO (x);
+  return REGNO_POINTER_ALIGN (regno) >= 128;
 }
 
 /* Encode symbol attributes (local vs. global, tls model) of a SYMBOL_REF
@@ -4143,9 +4188,12 @@
 static int
 store_with_one_insn_p (rtx mem)
 {
+  enum machine_mode mode = GET_MODE (mem);
   rtx addr = XEXP (mem, 0);
-  if (GET_MODE (mem) == BLKmode)
+  if (mode == BLKmode)
     return 0;
+  if (GET_MODE_SIZE (mode) >= 16)
+    return 1;
   /* Only static objects. */
   if (GET_CODE (addr) == SYMBOL_REF)
     {
@@ -4169,6 +4217,22 @@
   return 0;
 }
 
+/* Return 1 when the address is not valid for a simple load and store as
+   required by the '_mov*' patterns.   We could make this less strict
+   for loads, but we prefer MEMs to look the same so they are more
+   likely to be merged.  */
+static int
+address_needs_split (rtx mem)
+{
+  if (GET_MODE_SIZE (GET_MODE (mem)) < 16
+      && (GET_MODE_SIZE (GET_MODE (mem)) < 4
+	  || !(store_with_one_insn_p (mem)
+	       || mem_is_padded_component_ref (mem))))
+    return 1;
+
+  return 0;
+}
+
 int
 spu_expand_mov (rtx * ops, enum machine_mode mode)
 {
@@ -4213,54 +4277,63 @@
 	return spu_split_immediate (ops);
       return 0;
     }
-  else
+
+  /* Catch the SImode immediates greater than 0x7fffffff, and sign
+     extend them. */
+  if (GET_CODE (ops[1]) == CONST_INT)
     {
-      if (GET_CODE (ops[0]) == MEM)
-	{
-	  if (!spu_valid_move (ops))
-	    {
-	      emit_insn (gen_store (ops[0], ops[1], gen_reg_rtx (TImode),
-				    gen_reg_rtx (TImode)));
-	      return 1;
-	    }
-	}
-      else if (GET_CODE (ops[1]) == MEM)
+      HOST_WIDE_INT val = trunc_int_for_mode (INTVAL (ops[1]), mode);
+      if (val != INTVAL (ops[1]))
 	{
-	  if (!spu_valid_move (ops))
-	    {
-	      emit_insn (gen_load
-			 (ops[0], ops[1], gen_reg_rtx (TImode),
-			  gen_reg_rtx (SImode)));
-	      return 1;
-	    }
-	}
-      /* Catch the SImode immediates greater than 0x7fffffff, and sign
-         extend them. */
-      if (GET_CODE (ops[1]) == CONST_INT)
-	{
-	  HOST_WIDE_INT val = trunc_int_for_mode (INTVAL (ops[1]), mode);
-	  if (val != INTVAL (ops[1]))
-	    {
-	      emit_move_insn (ops[0], GEN_INT (val));
-	      return 1;
-	    }
+	  emit_move_insn (ops[0], GEN_INT (val));
+	  return 1;
 	}
     }
+  if (MEM_P (ops[0]))
+    return spu_split_store (ops);
+  if (MEM_P (ops[1]))
+    return spu_split_load (ops);
+
   return 0;
 }
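
A host-side sketch (illustration only) of the canonicalization the
CONST_INT branch of spu_expand_mov above performs: trunc_int_for_mode keeps
the low 32 bits for SImode and sign-extends, so an immediate carried as
0xffffffff is re-emitted as -1.

#include <stdio.h>
#include <stdint.h>

static int64_t trunc_for_si (int64_t val)
{
  return (int64_t) (int32_t) val;   /* keep 32 bits, then sign-extend */
}

int main (void)
{
  int64_t v = 0xffffffffLL;             /* "unsigned" spelling of -1   */
  if (trunc_for_si (v) != v)            /* mismatch -> emit a new move */
    printf ("re-emit as %lld\n", (long long) trunc_for_si (v));   /* -1 */
  return 0;
}
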
 
-void
-spu_split_load (rtx * ops)
+static void
+spu_convert_move (rtx dst, rtx src)
 {
-  enum machine_mode mode = GET_MODE (ops[0]);
-  rtx addr, load, rot, mem, p0, p1;
+  enum machine_mode mode = GET_MODE (dst);
+  enum machine_mode int_mode = mode_for_size (GET_MODE_BITSIZE (mode), MODE_INT, 0);
+  rtx reg;
+  gcc_assert (GET_MODE (src) == TImode);
+  reg = int_mode != mode ? gen_reg_rtx (int_mode) : dst;
+  emit_insn (gen_rtx_SET (VOIDmode, reg,
+	       gen_rtx_TRUNCATE (int_mode,
+		 gen_rtx_LSHIFTRT (TImode, src,
+		   GEN_INT (int_mode == DImode ? 64 : 96)))));
+  if (int_mode != mode)
+    {
+      reg = simplify_gen_subreg (mode, reg, int_mode, 0);
+      emit_move_insn (dst, reg);
+    }
+}
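
spu_convert_move pulls the scalar out of the preferred slot of the TImode
value; a host-side sketch of the shift amounts it uses (illustration only,
relies on GCC's unsigned __int128 extension): 128 - 32 = 96 moves the
left-most word down for 32-bit modes, 128 - 64 = 64 does the same for the
left-most doubleword, and narrower modes are first given an extra rotate
(GET_MODE_SIZE - 4) so their bytes land where the truncation expects them.

#include <stdio.h>
#include <stdint.h>

static uint32_t take_si_slot (unsigned __int128 quad)
{
  return (uint32_t) (quad >> 96);   /* SImode: bytes 0..3 of the quadword */
}

static uint64_t take_di_slot (unsigned __int128 quad)
{
  return (uint64_t) (quad >> 64);   /* DImode: bytes 0..7 of the quadword */
}

int main (void)
{
  unsigned __int128 quad = (unsigned __int128) 0xdeadbeefU << 96;
  printf ("%x\n", take_si_slot (quad));   /* deadbeef */
  (void) take_di_slot;
  return 0;
}
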
+
+/* Load TImode values into DST0 and DST1 (when it is non-NULL) using
+   the address from SRC and SRC+16.  Return a REG or CONST_INT that
+   specifies how many bytes to rotate the loaded registers, plus any
+   extra from EXTRA_ROTBY.  The address and rotate amounts are
+   normalized to improve merging of loads and rotate computations. */
+static rtx
+spu_expand_load (rtx dst0, rtx dst1, rtx src, int extra_rotby)
+{
+  rtx addr = XEXP (src, 0);
+  rtx p0, p1, rot, addr0, addr1;
   int rot_amt;
 
-  addr = XEXP (ops[1], 0);
-
   rot = 0;
   rot_amt = 0;
-  if (GET_CODE (addr) == PLUS)
+
+  if (MEM_ALIGN (src) >= 128)
+    /* Address is already aligned; simply perform a TImode load.  */ ;
+  else if (GET_CODE (addr) == PLUS)
     {
       /* 8 cases:
          aligned reg   + aligned reg     => lqx
@@ -4274,12 +4347,34 @@
        */
       p0 = XEXP (addr, 0);
       p1 = XEXP (addr, 1);
-      if (REG_P (p0) && !regno_aligned_for_load (REGNO (p0)))
+      if (!reg_aligned_for_addr (p0))
 	{
-	  if (REG_P (p1) && !regno_aligned_for_load (REGNO (p1)))
+	  if (REG_P (p1) && !reg_aligned_for_addr (p1))
+	    {
+	      rot = gen_reg_rtx (SImode);
+	      emit_insn (gen_addsi3 (rot, p0, p1));
+	    }
+	  else if (GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15))
 	    {
-	      emit_insn (gen_addsi3 (ops[3], p0, p1));
-	      rot = ops[3];
+	      if (INTVAL (p1) > 0
+		  && REG_POINTER (p0)
+		  && INTVAL (p1) * BITS_PER_UNIT
+		     < REGNO_POINTER_ALIGN (REGNO (p0)))
+		{
+		  rot = gen_reg_rtx (SImode);
+		  emit_insn (gen_addsi3 (rot, p0, p1));
+		  addr = p0;
+		}
+	      else
+		{
+		  rtx x = gen_reg_rtx (SImode);
+		  emit_move_insn (x, p1);
+		  if (!spu_arith_operand (p1, SImode))
+		    p1 = x;
+		  rot = gen_reg_rtx (SImode);
+		  emit_insn (gen_addsi3 (rot, p0, p1));
+		  addr = gen_rtx_PLUS (Pmode, p0, x);
+		}
 	    }
 	  else
 	    rot = p0;
@@ -4289,16 +4384,21 @@
 	  if (GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15))
 	    {
 	      rot_amt = INTVAL (p1) & 15;
-	      p1 = GEN_INT (INTVAL (p1) & -16);
-	      addr = gen_rtx_PLUS (SImode, p0, p1);
+	      if (INTVAL (p1) & -16)
+		{
+		  p1 = GEN_INT (INTVAL (p1) & -16);
+		  addr = gen_rtx_PLUS (SImode, p0, p1);
+		}
+	      else
+		addr = p0;
 	    }
-	  else if (REG_P (p1) && !regno_aligned_for_load (REGNO (p1)))
+	  else if (REG_P (p1) && !reg_aligned_for_addr (p1))
 	    rot = p1;
 	}
     }
-  else if (GET_CODE (addr) == REG)
+  else if (REG_P (addr))
     {
-      if (!regno_aligned_for_load (REGNO (addr)))
+      if (!reg_aligned_for_addr (addr))
 	rot = addr;
     }
   else if (GET_CODE (addr) == CONST)
@@ -4317,7 +4417,10 @@
 	    addr = XEXP (XEXP (addr, 0), 0);
 	}
       else
-	rot = addr;
+	{
+	  rot = gen_reg_rtx (Pmode);
+	  emit_move_insn (rot, addr);
+	}
     }
   else if (GET_CODE (addr) == CONST_INT)
     {
@@ -4325,49 +4428,96 @@
       addr = GEN_INT (rot_amt & -16);
     }
   else if (!ALIGNED_SYMBOL_REF_P (addr))
-    rot = addr;
-
-  if (GET_MODE_SIZE (mode) < 4)
-    rot_amt += GET_MODE_SIZE (mode) - 4;
+    {
+      rot = gen_reg_rtx (Pmode);
+      emit_move_insn (rot, addr);
+    }
+
+  rot_amt += extra_rotby;
 
   rot_amt &= 15;
 
   if (rot && rot_amt)
     {
-      emit_insn (gen_addsi3 (ops[3], rot, GEN_INT (rot_amt)));
-      rot = ops[3];
+      rtx x = gen_reg_rtx (SImode);
+      emit_insn (gen_addsi3 (x, rot, GEN_INT (rot_amt)));
+      rot = x;
       rot_amt = 0;
     }
-
-  load = ops[2];
-
-  addr = gen_rtx_AND (SImode, copy_rtx (addr), GEN_INT (-16));
-  mem = change_address (ops[1], TImode, addr);
-
-  emit_insn (gen_movti (load, mem));
+  if (!rot && rot_amt)
+    rot = GEN_INT (rot_amt);
+
+  addr0 = copy_rtx (addr);
+  addr0 = gen_rtx_AND (SImode, copy_rtx (addr), GEN_INT (-16));
+  emit_insn (gen__movti (dst0, change_address (src, TImode, addr0)));
+
+  if (dst1)
+    {
+      addr1 = plus_constant (copy_rtx (addr), 16);
+      addr1 = gen_rtx_AND (SImode, addr1, GEN_INT (-16));
+      emit_insn (gen__movti (dst1, change_address (src, TImode, addr1)));
+    }
+
+  return rot;
+}
+
+int
+spu_split_load (rtx * ops)
+{
+  enum machine_mode mode = GET_MODE (ops[0]);
+  rtx addr, load, rot;
+  int rot_amt;
+
+  if (GET_MODE_SIZE (mode) >= 16)
+    return 0;
+
+  addr = XEXP (ops[1], 0);
+  gcc_assert (GET_CODE (addr) != AND);
+
+  if (!address_needs_split (ops[1]))
+    {
+      ops[1] = change_address (ops[1], TImode, addr);
+      load = gen_reg_rtx (TImode);
+      emit_insn (gen__movti (load, ops[1]));
+      spu_convert_move (ops[0], load);
+      return 1;
+    }
+
+  rot_amt = GET_MODE_SIZE (mode) < 4 ? GET_MODE_SIZE (mode) - 4 : 0;
+
+  load = gen_reg_rtx (TImode);
+  rot = spu_expand_load (load, 0, ops[1], rot_amt);
 
   if (rot)
     emit_insn (gen_rotqby_ti (load, load, rot));
-  else if (rot_amt)
-    emit_insn (gen_rotlti3 (load, load, GEN_INT (rot_amt * 8)));
-
-  if (reload_completed)
-    emit_move_insn (ops[0], gen_rtx_REG (GET_MODE (ops[0]), REGNO (load)));
-  else
-    emit_insn (gen_spu_convert (ops[0], load));
+
+  spu_convert_move (ops[0], load);
+  return 1;
 }
 
-void
+int
 spu_split_store (rtx * ops)
 {
   enum machine_mode mode = GET_MODE (ops[0]);
-  rtx pat = ops[2];
-  rtx reg = ops[3];
+  rtx reg;
   rtx addr, p0, p1, p1_lo, smem;
   int aform;
   int scalar;
 
+  if (GET_MODE_SIZE (mode) >= 16)
+    return 0;
+
   addr = XEXP (ops[0], 0);
+  gcc_assert (GET_CODE (addr) != AND);
+
+  if (!address_needs_split (ops[0]))
+    {
+      reg = gen_reg_rtx (TImode);
+      emit_insn (gen_spu_convert (reg, ops[1]));
+      ops[0] = change_address (ops[0], TImode, addr);
+      emit_move_insn (ops[0], reg);
+      return 1;
+    }
 
   if (GET_CODE (addr) == PLUS)
     {
@@ -4379,19 +4529,31 @@
          unaligned reg + aligned reg     => lqx, c?x, shuf, stqx
          unaligned reg + unaligned reg   => lqx, c?x, shuf, stqx
          unaligned reg + aligned const   => lqd, c?d, shuf, stqx
-         unaligned reg + unaligned const -> not allowed by legitimate address
+         unaligned reg + unaligned const -> lqx, c?d, shuf, stqx
        */
       aform = 0;
       p0 = XEXP (addr, 0);
       p1 = p1_lo = XEXP (addr, 1);
-      if (GET_CODE (p0) == REG && GET_CODE (p1) == CONST_INT)
+      if (REG_P (p0) && GET_CODE (p1) == CONST_INT)
 	{
 	  p1_lo = GEN_INT (INTVAL (p1) & 15);
-	  p1 = GEN_INT (INTVAL (p1) & -16);
-	  addr = gen_rtx_PLUS (SImode, p0, p1);
+	  if (reg_aligned_for_addr (p0))
+	    {
+	      p1 = GEN_INT (INTVAL (p1) & -16);
+	      if (p1 == const0_rtx)
+		addr = p0;
+	      else
+		addr = gen_rtx_PLUS (SImode, p0, p1);
+	    }
+	  else
+	    {
+	      rtx x = gen_reg_rtx (SImode);
+	      emit_move_insn (x, p1);
+	      addr = gen_rtx_PLUS (SImode, p0, x);
+	    }
 	}
     }
-  else if (GET_CODE (addr) == REG)
+  else if (REG_P (addr))
     {
       aform = 0;
       p0 = addr;
@@ -4405,31 +4567,34 @@
       p1_lo = addr;
       if (ALIGNED_SYMBOL_REF_P (addr))
 	p1_lo = const0_rtx;
-      else if (GET_CODE (addr) == CONST)
+      else if (GET_CODE (addr) == CONST
+	       && GET_CODE (XEXP (addr, 0)) == PLUS
+	       && ALIGNED_SYMBOL_REF_P (XEXP (XEXP (addr, 0), 0))
+	       && GET_CODE (XEXP (XEXP (addr, 0), 1)) == CONST_INT)
 	{
-	  if (GET_CODE (XEXP (addr, 0)) == PLUS
-	      && ALIGNED_SYMBOL_REF_P (XEXP (XEXP (addr, 0), 0))
-	      && GET_CODE (XEXP (XEXP (addr, 0), 1)) == CONST_INT)
-	    {
-	      HOST_WIDE_INT v = INTVAL (XEXP (XEXP (addr, 0), 1));
-	      if ((v & -16) != 0)
-		addr = gen_rtx_CONST (Pmode,
-				      gen_rtx_PLUS (Pmode,
-						    XEXP (XEXP (addr, 0), 0),
-						    GEN_INT (v & -16)));
-	      else
-		addr = XEXP (XEXP (addr, 0), 0);
-	      p1_lo = GEN_INT (v & 15);
-	    }
+	  HOST_WIDE_INT v = INTVAL (XEXP (XEXP (addr, 0), 1));
+	  if ((v & -16) != 0)
+	    addr = gen_rtx_CONST (Pmode,
+				  gen_rtx_PLUS (Pmode,
+						XEXP (XEXP (addr, 0), 0),
+						GEN_INT (v & -16)));
+	  else
+	    addr = XEXP (XEXP (addr, 0), 0);
+	  p1_lo = GEN_INT (v & 15);
 	}
       else if (GET_CODE (addr) == CONST_INT)
 	{
 	  p1_lo = GEN_INT (INTVAL (addr) & 15);
 	  addr = GEN_INT (INTVAL (addr) & -16);
 	}
+      else
+	{
+	  p1_lo = gen_reg_rtx (SImode);
+	  emit_move_insn (p1_lo, addr);
+	}
     }
 
-  addr = gen_rtx_AND (SImode, copy_rtx (addr), GEN_INT (-16));
+  reg = gen_reg_rtx (TImode);
 
   scalar = store_with_one_insn_p (ops[0]);
   if (!scalar)
@@ -4439,11 +4604,12 @@
          possible, and copying the flags will prevent that in certain
          cases, e.g. consider the volatile flag. */
 
+      rtx pat = gen_reg_rtx (TImode);
       rtx lmem = change_address (ops[0], TImode, copy_rtx (addr));
       set_mem_alias_set (lmem, 0);
       emit_insn (gen_movti (reg, lmem));
 
-      if (!p0 || regno_aligned_for_load (REGNO (p0)))
+      if (!p0 || reg_aligned_for_addr (p0))
 	p0 = stack_pointer_rtx;
       if (!p1_lo)
 	p1_lo = const0_rtx;
@@ -4451,17 +4617,6 @@
       emit_insn (gen_cpat (pat, p0, p1_lo, GEN_INT (GET_MODE_SIZE (mode))));
       emit_insn (gen_shufb (reg, ops[1], reg, pat));
     }
-  else if (reload_completed)
-    {
-      if (GET_CODE (ops[1]) == REG)
-	emit_move_insn (reg, gen_rtx_REG (GET_MODE (reg), REGNO (ops[1])));
-      else if (GET_CODE (ops[1]) == SUBREG)
-	emit_move_insn (reg,
-			gen_rtx_REG (GET_MODE (reg),
-				     REGNO (SUBREG_REG (ops[1]))));
-      else
-	abort ();
-    }
   else
     {
       if (GET_CODE (ops[1]) == REG)
@@ -4473,15 +4628,16 @@
     }
 
   if (GET_MODE_SIZE (mode) < 4 && scalar)
-    emit_insn (gen_shlqby_ti
-	       (reg, reg, GEN_INT (4 - GET_MODE_SIZE (mode))));
-
-  smem = change_address (ops[0], TImode, addr);
+    emit_insn (gen_ashlti3
+	       (reg, reg, GEN_INT (32 - GET_MODE_BITSIZE (mode))));
+
+  smem = change_address (ops[0], TImode, copy_rtx (addr));
   /* We can't use the previous alias set because the memory has changed
      size and can potentially overlap objects of other types.  */
   set_mem_alias_set (smem, 0);
 
   emit_insn (gen_movti (smem, reg));
+  return 1;
 }
 
 /* Return TRUE if X is MEM which is a struct member reference
@@ -4580,37 +4736,6 @@
     }
 }
 
-int
-spu_valid_move (rtx * ops)
-{
-  enum machine_mode mode = GET_MODE (ops[0]);
-  if (!register_operand (ops[0], mode) && !register_operand (ops[1], mode))
-    return 0;
-
-  /* init_expr_once tries to recog against load and store insns to set
-     the direct_load[] and direct_store[] arrays.  We always want to
-     consider those loads and stores valid.  init_expr_once is called in
-     the context of a dummy function which does not have a decl. */
-  if (cfun->decl == 0)
-    return 1;
-
-  /* Don't allows loads/stores which would require more than 1 insn.
-     During and after reload we assume loads and stores only take 1
-     insn. */
-  if (GET_MODE_SIZE (mode) < 16 && !reload_in_progress && !reload_completed)
-    {
-      if (GET_CODE (ops[0]) == MEM
-	  && (GET_MODE_SIZE (mode) < 4
-	      || !(store_with_one_insn_p (ops[0])
-		   || mem_is_padded_component_ref (ops[0]))))
-	return 0;
-      if (GET_CODE (ops[1]) == MEM
-	  && (GET_MODE_SIZE (mode) < 4 || !aligned_mem_p (ops[1])))
-	return 0;
-    }
-  return 1;
-}
-
 /* Return TRUE if x is a CONST_INT, CONST_DOUBLE or CONST_VECTOR that
    can be generated using the fsmbi instruction. */
 int
@@ -6324,12 +6449,25 @@
 
 void
 spu_init_expanders (void)
-{   
-  /* HARD_FRAME_REGISTER is only 128 bit aligned when
-   * frame_pointer_needed is true.  We don't know that until we're
-   * expanding the prologue. */
+{
   if (cfun)
-    REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = 8;
+    {
+      rtx r0, r1;
+      /* HARD_FRAME_REGISTER is only 128 bit aligned when
+         frame_pointer_needed is true.  We don't know that until we're
+         expanding the prologue. */
+      REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = 8;
+
+      /* A number of passes use LAST_VIRTUAL_REGISTER+1 and
+	 LAST_VIRTUAL_REGISTER+2 to test the back-end.  We want them
+	 to be treated as aligned, so generate them here. */
+      r0 = gen_reg_rtx (SImode);
+      r1 = gen_reg_rtx (SImode);
+      mark_reg_pointer (r0, 128);
+      mark_reg_pointer (r1, 128);
+      gcc_assert (REGNO (r0) == LAST_VIRTUAL_REGISTER + 1
+		  && REGNO (r1) == LAST_VIRTUAL_REGISTER + 2);
+    }
 }
 
 static enum machine_mode
@@ -6372,5 +6510,53 @@
   return default_section_type_flags (decl, name, reloc);
 }
 
+/* Generate a constant or register which contains 2^SCALE.  We assume
+   the result is valid for MODE.  Currently, MODE must be V4SFmode and
+   SCALE must be SImode. */
+rtx
+spu_gen_exp2 (enum machine_mode mode, rtx scale)
+{
+  gcc_assert (mode == V4SFmode);
+  gcc_assert (GET_MODE (scale) == SImode || GET_CODE (scale) == CONST_INT);
+  if (GET_CODE (scale) != CONST_INT)
+    {
+      /* unsigned int exp = (127 + scale) << 23;
+	__vector float m = (__vector float) spu_splats (exp); */
+      rtx reg = force_reg (SImode, scale);
+      rtx exp = gen_reg_rtx (SImode);
+      rtx mul = gen_reg_rtx (mode);
+      emit_insn (gen_addsi3 (exp, reg, GEN_INT (127)));
+      emit_insn (gen_ashlsi3 (exp, exp, GEN_INT (23)));
+      emit_insn (gen_spu_splats (mul, gen_rtx_SUBREG (GET_MODE_INNER (mode), exp, 0)));
+      return mul;
+    }
+  else
+    {
+      HOST_WIDE_INT exp = 127 + INTVAL (scale);
+      unsigned char arr[16];
+      arr[0] = arr[4] = arr[8] = arr[12] = exp >> 1;
+      arr[1] = arr[5] = arr[9] = arr[13] = exp << 7;
+      arr[2] = arr[6] = arr[10] = arr[14] = 0;
+      arr[3] = arr[7] = arr[11] = arr[15] = 0;
+      return array_to_constant (mode, arr);
+    }
+}
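
A host-side sketch (illustration only, assumes the host uses IEEE-754
single-precision floats) of the byte encoding produced by the constant
branch of spu_gen_exp2 above: each 4-byte vector element is
(127 + scale) << 23, built from the two exponent bytes exp >> 1 and
exp << 7.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

static float exp2_const (int scale)
{
  int exp = 127 + scale;
  uint32_t bits = ((uint32_t) (exp >> 1) << 24)            /* arr[0] */
                | ((uint32_t) (uint8_t) (exp << 7) << 16); /* arr[1] */
  float f;
  memcpy (&f, &bits, sizeof f);   /* reinterpret the bit pattern */
  return f;
}

int main (void)
{
  printf ("%g %g\n", exp2_const (3), exp2_const (-1));   /* 8 0.5 */
  return 0;
}
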
+
+/* After reload, just change the convert into a move instruction
+   or a dead instruction. */
+void
+spu_split_convert (rtx ops[])
+{
+  if (REGNO (ops[0]) == REGNO (ops[1]))
+    emit_note (NOTE_INSN_DELETED);
+  else
+    {
+      /* Use TImode always as this might help hard reg copyprop.  */
+      rtx op0 = gen_rtx_REG (TImode, REGNO (ops[0]));
+      rtx op1 = gen_rtx_REG (TImode, REGNO (ops[1]));
+      emit_insn (gen_move_insn (op0, op1));
+    }
+}
+
 #include "gt-spu.h"