diff gcc/config/rs6000/rs6000-p8swap.c @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
parents 04ced10e8804
children 1830386684a0
--- a/gcc/config/rs6000/rs6000-p8swap.c	Fri Oct 27 22:46:09 2017 +0900
+++ b/gcc/config/rs6000/rs6000-p8swap.c	Thu Oct 25 07:37:49 2018 +0900
@@ -1,6 +1,6 @@
 /* Subroutines used to remove unnecessary doubleword swaps
    for p8 little-endian VSX code.
-   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   Copyright (C) 1991-2018 Free Software Foundation, Inc.
 
    This file is part of GCC.
 
@@ -18,6 +18,8 @@
    along with GCC; see the file COPYING3.  If not see
    <http://www.gnu.org/licenses/>.  */
 
+#define IN_TARGET_CODE 1
+
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -34,6 +36,7 @@
 #include "expr.h"
 #include "output.h"
 #include "tree-pass.h"
+#include "rtx-vector-builder.h"
 
 /* Analyze vector computations and remove unnecessary doubleword
    swaps (xxswapdi instructions).  This pass is performed only
@@ -325,6 +328,174 @@
   return 1;
 }
 
+/* Return true iff EXPR represents the sum of two registers.  */
+bool
+rs6000_sum_of_two_registers_p (const_rtx expr)
+{
+  if (GET_CODE (expr) == PLUS)
+    {
+      const_rtx operand1 = XEXP (expr, 0);
+      const_rtx operand2 = XEXP (expr, 1);
+      return (REG_P (operand1) && REG_P (operand2));
+    }
+  return false;
+}
+
+/* Return true iff EXPR represents an address expression that masks off
+   the low-order 4 bits in the style of an lvx or stvx rtl pattern.  */
+bool
+rs6000_quadword_masked_address_p (const_rtx expr)
+{
+  if (GET_CODE (expr) == AND)
+    {
+      const_rtx operand1 = XEXP (expr, 0);
+      const_rtx operand2 = XEXP (expr, 1);
+      if ((REG_P (operand1) || rs6000_sum_of_two_registers_p (operand1))
+	  && CONST_SCALAR_INT_P (operand2) && INTVAL (operand2) == -16)
+	return true;
+    }
+  return false;
+}
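+
+/* Illustrative sketch (not part of this patch): the AND form recognized
+   above can be constructed and checked as follows, using a hypothetical
+   DImode base register:
+
+     rtx base = gen_rtx_REG (DImode, 3);
+     rtx addr = gen_rtx_AND (DImode, base, GEN_INT (-16));
+     gcc_checking_assert (rs6000_quadword_masked_address_p (addr));
+
+   Masking with -16 clears the low-order 4 bits, which is how the lvx
+   and stvx patterns model the hardware's forced 16-byte alignment.  */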
+
+/* Return TRUE if INSN represents a swap of a swapped load from memory
+   and the memory address is quad-word aligned.  */
+static bool
+quad_aligned_load_p (swap_web_entry *insn_entry, rtx_insn *insn)
+{
+  unsigned uid = INSN_UID (insn);
+  if (!insn_entry[uid].is_swap || insn_entry[uid].is_load)
+    return false;
+
+  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+
+  /* Since insn is known to represent a swap instruction, we know it
+     "uses" only one input variable.  */
+  df_ref use = DF_INSN_INFO_USES (insn_info);
+
+  /* Figure out where this input variable is defined.  */
+  struct df_link *def_link = DF_REF_CHAIN (use);
+
+  /* If there is no definition or the definition is artificial or there are
+     multiple definitions, punt.  */
+  if (!def_link || !def_link->ref || DF_REF_IS_ARTIFICIAL (def_link->ref)
+      || def_link->next)
+    return false;
+
+  rtx def_insn = DF_REF_INSN (def_link->ref);
+  unsigned uid2 = INSN_UID (def_insn);
+  /* We're looking for a load-with-swap insn.  If the def insn is
+     anything else, return false.  */
+  if (!insn_entry[uid2].is_load || !insn_entry[uid2].is_swap)
+    return false;
+
+  /* If the def insn's pattern is not a set whose source is a swapped
+     load from memory (a VEC_SELECT of a MEM), return false.  */
+  rtx body = PATTERN (def_insn);
+  if (GET_CODE (body) != SET
+      || GET_CODE (SET_SRC (body)) != VEC_SELECT
+      || GET_CODE (XEXP (SET_SRC (body), 0)) != MEM)
+    return false;
+
+  rtx mem = XEXP (SET_SRC (body), 0);
+  rtx base_reg = XEXP (mem, 0);
+  return ((REG_P (base_reg) || rs6000_sum_of_two_registers_p (base_reg))
+	  && MEM_ALIGN (mem) >= 128);
+}
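+
+/* For illustration (not part of this patch), the insn pair matched by
+   quad_aligned_load_p has this shape for little-endian V2DI, with
+   hypothetical register numbers and a MEM known to be 128-bit aligned:
+
+     (set (reg:V2DI 125)
+          (vec_select:V2DI (mem:V2DI (reg:DI 3))
+                           (parallel [(const_int 1) (const_int 0)])))
+     (set (reg:V2DI 126)
+          (vec_select:V2DI (reg:V2DI 125)
+                           (parallel [(const_int 1) (const_int 0)])))
+
+   INSN is the second set; the use of reg 125 is chained back to the
+   load-with-swap that defines it.  */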
+
+/* Return TRUE if INSN represents a store-with-swap of a swapped value
+   and the memory address is quad-word aligned.  */
+static bool
+quad_aligned_store_p (swap_web_entry *insn_entry, rtx_insn *insn)
+{
+  unsigned uid = INSN_UID (insn);
+  if (!insn_entry[uid].is_swap || !insn_entry[uid].is_store)
+    return false;
+
+  rtx body = PATTERN (insn);
+  rtx dest_address = XEXP (SET_DEST (body), 0);
+  rtx swap_reg = XEXP (SET_SRC (body), 0);
+
+  /* If the base address for the memory expression is not represented
+     by a single register and is not the sum of two registers, punt.  */
+  if (!REG_P (dest_address) && !rs6000_sum_of_two_registers_p (dest_address))
+    return false;
+
+  /* Confirm that the value to be stored is produced by a swap
+     instruction.  */
+  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  df_ref use;
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+
+      /* If this is not the definition of the candidate swap register,
+	 then skip it.  We are interested in a different definition.  */
+      if (!rtx_equal_p (DF_REF_REG (use), swap_reg))
+	continue;
+
+      /* If there is no def or the def is artificial or there are
+	 multiple defs, punt.  */
+      if (!def_link || !def_link->ref || DF_REF_IS_ARTIFICIAL (def_link->ref)
+	  || def_link->next)
+	return false;
+
+      rtx def_insn = DF_REF_INSN (def_link->ref);
+      unsigned uid2 = INSN_UID (def_insn);
+
+      /* If this source value is not a simple swap, return false.  */
+      if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load
+	  || insn_entry[uid2].is_store)
+	return false;
+
+      /* We've processed the use we care about, so break out of
+	 this loop.  */
+      break;
+    }
+
+  /* At this point, we know the source data comes from a swap.  The
+     remaining question is whether the memory address is aligned.  */
+  rtx set = single_set (insn);
+  if (set)
+    {
+      rtx dest = SET_DEST (set);
+      if (MEM_P (dest))
+	return (MEM_ALIGN (dest) >= 128);
+    }
+  return false;
+}
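+
+/* A minimal C-level sketch (illustrative, not from this patch) of the
+   code these two predicates target: a vector object is 16-byte aligned
+   by its type, so the store below qualifies for an unswapped stvx in
+   place of a swap followed by a permuting store:
+
+     #include <altivec.h>
+     vector int v;
+     void set_v (vector int x) { v = x; }  */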
+
+/* Return 1 iff INSN_ENTRY, known to reference a swap, is both fed by
+   a load and a feeder of a store.  */
+static unsigned int
+swap_feeds_both_load_and_store (swap_web_entry *insn_entry)
+{
+  rtx insn = insn_entry->insn;
+  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  df_ref def, use;
+  struct df_link *link = 0;
+  rtx_insn *load = 0, *store = 0;
+  bool fed_by_load = false;
+  bool feeds_store = false;
+
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    {
+      link = DF_REF_CHAIN (use);
+      if (!link || !link->ref || DF_REF_IS_ARTIFICIAL (link->ref))
+	continue;
+      load = DF_REF_INSN (link->ref);
+      if (insn_is_load_p (load) && insn_is_swap_p (load))
+	fed_by_load = true;
+    }
+
+  FOR_EACH_INSN_INFO_DEF (def, insn_info)
+    {
+      link = DF_REF_CHAIN (def);
+      if (!link || !link->ref || DF_REF_IS_ARTIFICIAL (link->ref))
+	continue;
+      store = DF_REF_INSN (link->ref);
+      if (insn_is_store_p (store) && insn_is_swap_p (store))
+	feeds_store = true;
+    }
+
+  return fed_by_load && feeds_store;
+}
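+
+/* Illustrative source-level case (not from this patch) that
+   swap_feeds_both_load_and_store guards against: the same swap is fed
+   by the permuting load from vec_xl and feeds the permuting store from
+   vec_xst_be, so removing it would change the program's behavior:
+
+     #include <altivec.h>
+     void copy_be (signed int *src, signed int *dst)
+     {
+       vector signed int v = vec_xl (0, src);
+       vec_xst_be (v, 0, dst);
+     }  */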
+
 /* Return TRUE if insn is a swap fed by a load from the constant pool.  */
 static bool
 const_load_sequence_p (swap_web_entry *insn_entry, rtx insn)
@@ -337,6 +508,9 @@
 
   struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
   df_ref use;
+
+  /* Iterate over the definitions that are used by this insn.  Since
+     this is known to be a swap insn, expect only one used definition.  */
   FOR_EACH_INSN_INFO_USE (use, insn_info)
     {
       struct df_link *def_link = DF_REF_CHAIN (use);
@@ -419,8 +593,10 @@
 	         remove this special test.  */
 	      rtx const_vector = get_pool_constant (base);
 	      if (GET_CODE (const_vector) == SYMBOL_REF
-		  && !CONSTANT_POOL_ADDRESS_P (const_vector))
-		    return false;
+		  && CONSTANT_POOL_ADDRESS_P (const_vector))
+		const_vector = get_pool_constant (const_vector);
+	      if (GET_CODE (const_vector) != CONST_VECTOR)
+		return false;
 	    }
 	}
     }
@@ -567,6 +743,7 @@
 	  {
 	  default:
 	    break;
+	  case UNSPEC_VBPERMQ:
 	  case UNSPEC_VMRGH_DIRECT:
 	  case UNSPEC_VMRGL_DIRECT:
 	  case UNSPEC_VPACK_SIGN_SIGN_SAT:
@@ -578,6 +755,7 @@
 	  case UNSPEC_VPERM_UNS:
 	  case UNSPEC_VPERMHI:
 	  case UNSPEC_VPERMSI:
+	  case UNSPEC_VPERMXOR:
 	  case UNSPEC_VPKPX:
 	  case UNSPEC_VSLDOI:
 	  case UNSPEC_VSLO:
@@ -588,8 +766,13 @@
 	  case UNSPEC_VSUMSWS:
 	  case UNSPEC_VSUMSWS_DIRECT:
 	  case UNSPEC_VSX_CONCAT:
+	  case UNSPEC_VSX_CVDPSPN:
+	  case UNSPEC_VSX_CVSPDP:
+	  case UNSPEC_VSX_CVSPDPN:
+	  case UNSPEC_VSX_EXTRACT:
 	  case UNSPEC_VSX_SET:
 	  case UNSPEC_VSX_SLDWI:
+	  case UNSPEC_VSX_VSLO:
 	  case UNSPEC_VUNPACK_HI_SIGN:
 	  case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
 	  case UNSPEC_VUNPACK_LO_SIGN:
@@ -600,12 +783,6 @@
 	  case UNSPEC_VUPKLPX:
 	  case UNSPEC_VUPKLS_V4SF:
 	  case UNSPEC_VUPKLU_V4SF:
-	  case UNSPEC_VSX_CVDPSPN:
-	  case UNSPEC_VSX_CVSPDP:
-	  case UNSPEC_VSX_CVSPDPN:
-	  case UNSPEC_VSX_EXTRACT:
-	  case UNSPEC_VSX_VSLO:
-	  case UNSPEC_VSX_VEC_INIT:
 	    return 0;
 	  case UNSPEC_VSPLT_DIRECT:
 	  case UNSPEC_VSX_XXSPLTD:
@@ -699,10 +876,11 @@
   if (insn_entry[i].is_store)
     {
       if (GET_CODE (body) == SET
-	  && GET_CODE (SET_SRC (body)) != UNSPEC)
+	  && GET_CODE (SET_SRC (body)) != UNSPEC
+	  && GET_CODE (SET_SRC (body)) != VEC_SELECT)
 	{
 	  rtx lhs = SET_DEST (body);
-	  /* Even without a swap, the LHS might be a vec_select for, say,
+	  /* Even without a swap, the RHS might be a vec_select for, say,
 	     a byte-reversing store.  */
 	  if (GET_CODE (lhs) != MEM)
 	    return 0;
@@ -929,23 +1107,24 @@
     }
 }
 
-/* OP is either a CONST_VECTOR or an expression containing one.
+/* *OP_PTR is either a CONST_VECTOR or an expression containing one.
    Swap the first half of the vector with the second in the first
    case.  Recurse to find it in the second.  */
 static void
-swap_const_vector_halves (rtx op)
+swap_const_vector_halves (rtx *op_ptr)
 {
   int i;
+  rtx op = *op_ptr;
   enum rtx_code code = GET_CODE (op);
   if (GET_CODE (op) == CONST_VECTOR)
     {
-      int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2;
-      for (i = 0; i < half_units; ++i)
-	{
-	  rtx temp = CONST_VECTOR_ELT (op, i);
-	  CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units);
-	  CONST_VECTOR_ELT (op, i + half_units) = temp;
-	}
+      int units = GET_MODE_NUNITS (GET_MODE (op));
+      rtx_vector_builder builder (GET_MODE (op), units, 1);
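+      /* E.g., an illustrative V4SI constant {0,1,2,3} is rebuilt here
+	 as {2,3,0,1}: the second half is pushed first, then the first.  */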
+      for (i = 0; i < units / 2; ++i)
+	builder.quick_push (CONST_VECTOR_ELT (op, i + units / 2));
+      for (i = 0; i < units / 2; ++i)
+	builder.quick_push (CONST_VECTOR_ELT (op, i));
+      *op_ptr = builder.build ();
     }
   else
     {
@@ -953,10 +1132,10 @@
       const char *fmt = GET_RTX_FORMAT (code);
       for (i = 0; i < GET_RTX_LENGTH (code); ++i)
 	if (fmt[i] == 'e' || fmt[i] == 'u')
-	  swap_const_vector_halves (XEXP (op, i));
+	  swap_const_vector_halves (&XEXP (op, i));
 	else if (fmt[i] == 'E')
 	  for (j = 0; j < XVECLEN (op, i); ++j)
-	    swap_const_vector_halves (XVECEXP (op, i, j));
+	    swap_const_vector_halves (&XVECEXP (op, i, j));
     }
 }
 
@@ -1249,8 +1428,7 @@
       {
 	/* A CONST_VECTOR will only show up somewhere in the RHS of a SET.  */
 	gcc_assert (GET_CODE (body) == SET);
-	rtx rhs = SET_SRC (body);
-	swap_const_vector_halves (rhs);
+	swap_const_vector_halves (&SET_SRC (body));
 	if (dump_file)
 	  fprintf (dump_file, "Swapping constant halves in insn %d\n", i);
 	break;
@@ -1318,7 +1496,263 @@
   insn->set_deleted ();
 }
 
-/* Given that swap_insn represents a swap of a load of a constant
+/* Make NEW_MEM_EXP's attributes and flags resemble those of
+   ORIGINAL_MEM_EXP.  */
+static void
+mimic_memory_attributes_and_flags (rtx new_mem_exp, const_rtx original_mem_exp)
+{
+  RTX_FLAG (new_mem_exp, jump) = RTX_FLAG (original_mem_exp, jump);
+  RTX_FLAG (new_mem_exp, call) = RTX_FLAG (original_mem_exp, call);
+  RTX_FLAG (new_mem_exp, unchanging) = RTX_FLAG (original_mem_exp, unchanging);
+  RTX_FLAG (new_mem_exp, volatil) = RTX_FLAG (original_mem_exp, volatil);
+  RTX_FLAG (new_mem_exp, frame_related) =
+    RTX_FLAG (original_mem_exp, frame_related);
+
+  /* The following fields may not be used with MEM subexpressions.  */
+  RTX_FLAG (new_mem_exp, in_struct) = RTX_FLAG (original_mem_exp, in_struct);
+  RTX_FLAG (new_mem_exp, return_val) = RTX_FLAG (original_mem_exp, return_val);
+
+  struct mem_attrs original_attrs = *get_mem_attrs (original_mem_exp);
+
+  alias_set_type set = original_attrs.alias;
+  set_mem_alias_set (new_mem_exp, set);
+
+  addr_space_t addrspace = original_attrs.addrspace;
+  set_mem_addr_space (new_mem_exp, addrspace);
+
+  unsigned int align = original_attrs.align;
+  set_mem_align (new_mem_exp, align);
+
+  tree expr = original_attrs.expr;
+  set_mem_expr (new_mem_exp, expr);
+
+  if (original_attrs.offset_known_p)
+    {
+      HOST_WIDE_INT offset = original_attrs.offset;
+      set_mem_offset (new_mem_exp, offset);
+    }
+  else
+    clear_mem_offset (new_mem_exp);
+
+  if (original_attrs.size_known_p)
+    {
+      HOST_WIDE_INT size = original_attrs.size;
+      set_mem_size (new_mem_exp, size);
+    }
+  else
+    clear_mem_size (new_mem_exp);
+}
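+
+/* Usage sketch (illustrative): the generators below rely on this so
+   that the freshly built MEM keeps its alias set, alignment, and size:
+
+     rtx stvx = rs6000_gen_stvx (V4SImode, dest_exp, src_exp);
+     rtx new_mem = SET_DEST (PATTERN (stvx));
+     gcc_checking_assert (MEM_ALIGN (new_mem) == MEM_ALIGN (dest_exp));  */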
+
+/* Generate an rtx expression to represent use of the stvx insn to store
+   the value represented by register SRC_EXP into the memory at address
+   DEST_EXP, with vector mode MODE.  */
+rtx
+rs6000_gen_stvx (enum machine_mode mode, rtx dest_exp, rtx src_exp)
+{
+  rtx stvx;
+
+  if (mode == V16QImode)
+    stvx = gen_altivec_stvx_v16qi (src_exp, dest_exp);
+  else if (mode == V8HImode)
+    stvx = gen_altivec_stvx_v8hi (src_exp, dest_exp);
+#ifdef HAVE_V8HFmode
+  else if (mode == V8HFmode)
+    stvx = gen_altivec_stvx_v8hf (src_exp, dest_exp);
+#endif
+  else if (mode == V4SImode)
+    stvx = gen_altivec_stvx_v4si (src_exp, dest_exp);
+  else if (mode == V4SFmode)
+    stvx = gen_altivec_stvx_v4sf (src_exp, dest_exp);
+  else if (mode == V2DImode)
+    stvx = gen_altivec_stvx_v2di (src_exp, dest_exp);
+  else if (mode == V2DFmode)
+    stvx = gen_altivec_stvx_v2df (src_exp, dest_exp);
+  else if (mode == V1TImode)
+    stvx = gen_altivec_stvx_v1ti (src_exp, dest_exp);
+  else
+    /* KFmode, TFmode, other modes not expected in this context.  */
+    gcc_unreachable ();
+
+  rtx new_mem_exp = SET_DEST (PATTERN (stvx));
+  mimic_memory_attributes_and_flags (new_mem_exp, dest_exp);
+  return stvx;
+}
+
+/* Given that STORE_INSN represents an aligned store-with-swap of a
+   swapped value, replace the store with an aligned store (without
+   swap) and replace the swap with a copy insn.  */
+static void
+replace_swapped_aligned_store (swap_web_entry *insn_entry,
+			       rtx_insn *store_insn)
+{
+  unsigned uid = INSN_UID (store_insn);
+  gcc_assert (insn_entry[uid].is_swap && insn_entry[uid].is_store);
+
+  rtx body = PATTERN (store_insn);
+  rtx dest_address = XEXP (SET_DEST (body), 0);
+  rtx swap_reg = XEXP (SET_SRC (body), 0);
+  gcc_assert (REG_P (dest_address)
+	      || rs6000_sum_of_two_registers_p (dest_address));
+
+  /* Find the swap instruction that provides the value to be stored
+     by this store-with-swap instruction.  */
+  struct df_insn_info *insn_info = DF_INSN_INFO_GET (store_insn);
+  df_ref use;
+  rtx_insn *swap_insn = NULL;
+  unsigned uid2 = 0;
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+
+      /* If this is not the definition of the candidate swap register,
+	 then skip it.  We are only interested in the swap insn.  */
+      if (!rtx_equal_p (DF_REF_REG (use), swap_reg))
+	continue;
+
+      /* If there is no def or the def is artificial or there are
+	 multiple defs, we should not be here.  */
+      gcc_assert (def_link && def_link->ref && !def_link->next
+		  && !DF_REF_IS_ARTIFICIAL (def_link->ref));
+
+      swap_insn = DF_REF_INSN (def_link->ref);
+      uid2 = INSN_UID (swap_insn);
+
+      /* If this source value is not a simple swap, we should not be here.  */
+      gcc_assert (insn_entry[uid2].is_swap && !insn_entry[uid2].is_load
+		  && !insn_entry[uid2].is_store);
+
+      /* We've processed the use we care about, so break out of
+	 this loop.  */
+      break;
+    }
+
+  /* At this point, swap_insn and uid2 represent the swap instruction
+     that feeds the store.  */
+  gcc_assert (swap_insn);
+  rtx set = single_set (store_insn);
+  gcc_assert (set);
+  rtx dest_exp = SET_DEST (set);
+  rtx src_exp = XEXP (SET_SRC (body), 0);
+  enum machine_mode mode = GET_MODE (dest_exp);
+  gcc_assert (MEM_P (dest_exp));
+  gcc_assert (MEM_ALIGN (dest_exp) >= 128);
+
+  /* Replace the store-with-swap with an aligned stvx store.  */
+  rtx stvx = rs6000_gen_stvx (mode, dest_exp, src_exp);
+
+  rtx_insn *new_insn = emit_insn_before (stvx, store_insn);
+  rtx new_body = PATTERN (new_insn);
+
+  gcc_assert ((GET_CODE (new_body) == SET)
+	      && (GET_CODE (SET_DEST (new_body)) == MEM));
+
+  set_block_for_insn (new_insn, BLOCK_FOR_INSN (store_insn));
+  df_insn_rescan (new_insn);
+
+  df_insn_delete (store_insn);
+  remove_insn (store_insn);
+  store_insn->set_deleted ();
+
+  /* Replace the swap with a copy.  */
+  uid2 = INSN_UID (swap_insn);
+  mark_swaps_for_removal (insn_entry, uid2);
+  replace_swap_with_copy (insn_entry, uid2);
+}
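+
+/* Before/after sketch for the transformation above (illustrative
+   register numbers):
+
+     (set (reg:V2DI 125)
+          (vec_select:V2DI (reg:V2DI 124) ...))   ; swap
+     (set (mem:V2DI (reg:DI 9))
+          (vec_select:V2DI (reg:V2DI 125) ...))   ; store-with-swap
+
+   becomes a plain copy feeding an aligned stvx-style store whose
+   address masks off the low-order 4 bits:
+
+     (set (reg:V2DI 125) (reg:V2DI 124))
+     (set (mem:V2DI (and:DI (reg:DI 9) (const_int -16)))
+          (reg:V2DI 125))  */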
+
+/* Generate an rtx expression to represent use of the lvx insn to load
+   from memory SRC_EXP into register DEST_EXP with vector mode MODE.  */
+rtx
+rs6000_gen_lvx (enum machine_mode mode, rtx dest_exp, rtx src_exp)
+{
+  rtx lvx;
+
+  if (mode == V16QImode)
+    lvx = gen_altivec_lvx_v16qi (dest_exp, src_exp);
+  else if (mode == V8HImode)
+    lvx = gen_altivec_lvx_v8hi (dest_exp, src_exp);
+#ifdef HAVE_V8HFmode
+  else if (mode == V8HFmode)
+    lvx = gen_altivec_lvx_v8hf (dest_exp, src_exp);
+#endif
+  else if (mode == V4SImode)
+    lvx = gen_altivec_lvx_v4si (dest_exp, src_exp);
+  else if (mode == V4SFmode)
+    lvx = gen_altivec_lvx_v4sf (dest_exp, src_exp);
+  else if (mode == V2DImode)
+    lvx = gen_altivec_lvx_v2di (dest_exp, src_exp);
+  else if (mode == V2DFmode)
+    lvx = gen_altivec_lvx_v2df (dest_exp, src_exp);
+  else if (mode == V1TImode)
+    lvx = gen_altivec_lvx_v1ti (dest_exp, src_exp);
+  else
+    /* KFmode, TFmode, other modes not expected in this context.  */
+    gcc_unreachable ();
+
+  rtx new_mem_exp = SET_SRC (PATTERN (lvx));
+  mimic_memory_attributes_and_flags (new_mem_exp, src_exp);
+
+  return lvx;
+}
+
+/* Given that SWAP_INSN represents a swap of an aligned
+   load-with-swap, replace the load with an aligned load (without
+   swap) and replace the swap with a copy insn.  */
+static void
+replace_swapped_aligned_load (swap_web_entry *insn_entry, rtx swap_insn)
+{
+  /* Find the load.  */
+  unsigned uid = INSN_UID (swap_insn);
+  /* Only call this if quad_aligned_load_p (swap_insn).  */
+  gcc_assert (insn_entry[uid].is_swap && !insn_entry[uid].is_load);
+  struct df_insn_info *insn_info = DF_INSN_INFO_GET (swap_insn);
+
+  /* Since insn is known to represent a swap instruction, we know it
+     "uses" only one input variable.  */
+  df_ref use = DF_INSN_INFO_USES (insn_info);
+
+  /* Figure out where this input variable is defined.  */
+  struct df_link *def_link = DF_REF_CHAIN (use);
+  gcc_assert (def_link && def_link->ref
+	      && !DF_REF_IS_ARTIFICIAL (def_link->ref)
+	      && !def_link->next);
+
+  rtx_insn *def_insn = DF_REF_INSN (def_link->ref);
+  unsigned uid2 = INSN_UID (def_insn);
+
+  /* We're expecting a load-with-swap insn.  */
+  gcc_assert (insn_entry[uid2].is_load && insn_entry[uid2].is_swap);
+
+  /* We expect this to be a set from memory, with the source
+     representing a swap (indicated by code VEC_SELECT).  */
+  rtx body = PATTERN (def_insn);
+  gcc_assert ((GET_CODE (body) == SET)
+	      && (GET_CODE (SET_SRC (body)) == VEC_SELECT)
+	      && (GET_CODE (XEXP (SET_SRC (body), 0)) == MEM));
+
+  rtx src_exp = XEXP (SET_SRC (body), 0);
+  enum machine_mode mode = GET_MODE (src_exp);
+  rtx lvx = rs6000_gen_lvx (mode, SET_DEST (body), src_exp);
+
+  rtx_insn *new_insn = emit_insn_before (lvx, def_insn);
+  rtx new_body = PATTERN (new_insn);
+
+  gcc_assert ((GET_CODE (new_body) == SET)
+	      && (GET_CODE (SET_SRC (new_body)) == MEM));
+
+  set_block_for_insn (new_insn, BLOCK_FOR_INSN (def_insn));
+  df_insn_rescan (new_insn);
+
+  df_insn_delete (def_insn);
+  remove_insn (def_insn);
+  def_insn->set_deleted ();
+
+  /* Replace the swap with a copy.  */
+  mark_swaps_for_removal (insn_entry, uid);
+  replace_swap_with_copy (insn_entry, uid);
+}
+
+/* Given that SWAP_INSN represents a swap of a load of a constant
    vector value, replace with a single instruction that loads a
    swapped variant of the original constant.
 
@@ -2027,6 +2461,14 @@
 	  && !insn_entry[i].is_swap && !insn_entry[i].is_swappable)
 	root->web_not_optimizable = 1;
 
+      /* If we have a swap that is both fed by a permuting load
+	 and a feeder of a permuting store, then the optimization
+	 isn't appropriate.  (Consider vec_xl followed by vec_xst_be.)  */
+      else if (insn_entry[i].is_swap && !insn_entry[i].is_load
+	       && !insn_entry[i].is_store
+	       && swap_feeds_both_load_and_store (&insn_entry[i]))
+	root->web_not_optimizable = 1;
+
       /* If we have permuting loads or stores that are not accompanied
 	 by a register swap, the optimization isn't appropriate.  */
       else if (insn_entry[i].is_load && insn_entry[i].is_swap)
@@ -2101,8 +2543,17 @@
   /* Clean up.  */
   free (insn_entry);
 
-  /* Use additional pass over rtl to replace swap(load(vector constant))
-     with load(swapped vector constant). */
+  /* Use a second pass over rtl to detect that certain vector values
+     fetched from or stored to memory on quad-word aligned addresses
+     can use lvx/stvx without swaps.  */
+
+  /* First, rebuild ud chains.  */
+  df_remove_problem (df_chain);
+  df_process_deferred_rescans ();
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+  df_chain_add_problem (DF_UD_CHAIN);
+  df_analyze ();
+
   swap_web_entry *pass2_insn_entry;
   pass2_insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
 
@@ -2131,13 +2582,69 @@
     if (pass2_insn_entry[i].is_swap && !pass2_insn_entry[i].is_load
 	&& !pass2_insn_entry[i].is_store)
       {
-	insn = pass2_insn_entry[i].insn;
-	if (const_load_sequence_p (pass2_insn_entry, insn))
-	  replace_swapped_load_constant (pass2_insn_entry, insn);
+	/* Replace swap of aligned load-swap with aligned unswapped
+	   load.  */
+	insn = pass2_insn_entry[i].insn;
+	if (quad_aligned_load_p (pass2_insn_entry, insn))
+	  replace_swapped_aligned_load (pass2_insn_entry, insn);
+      }
+    else if (pass2_insn_entry[i].is_swap && pass2_insn_entry[i].is_store)
+      {
+	/* Replace aligned store-swap of swapped value with aligned
+	   unswapped store.  */
+	insn = pass2_insn_entry[i].insn;
+	if (quad_aligned_store_p (pass2_insn_entry, insn))
+	  replace_swapped_aligned_store (pass2_insn_entry, insn);
       }
 
   /* Clean up.  */
   free (pass2_insn_entry);
+
+  /* Use a third pass over rtl to replace swap(load(vector constant))
+     with load(swapped vector constant).  */
+
+  /* First, rebuild ud chains.  */
+  df_remove_problem (df_chain);
+  df_process_deferred_rescans ();
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+  df_chain_add_problem (DF_UD_CHAIN);
+  df_analyze ();
+
+  swap_web_entry *pass3_insn_entry;
+  pass3_insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
+
+  /* Walk the insns to gather basic data.  */
+  FOR_ALL_BB_FN (bb, fun)
+    FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
+    {
+      unsigned int uid = INSN_UID (insn);
+      if (NONDEBUG_INSN_P (insn))
+	{
+	  pass3_insn_entry[uid].insn = insn;
+
+	  pass3_insn_entry[uid].is_relevant = 1;
+	  pass3_insn_entry[uid].is_load = insn_is_load_p (insn);
+	  pass3_insn_entry[uid].is_store = insn_is_store_p (insn);
+
+	  /* Determine if this is a doubleword swap.  If not,
+	     determine whether it can legally be swapped.  */
+	  if (insn_is_swap_p (insn))
+	    pass3_insn_entry[uid].is_swap = 1;
+	}
+    }
+
+  e = get_max_uid ();
+  for (unsigned i = 0; i < e; ++i)
+    if (pass3_insn_entry[i].is_swap && !pass3_insn_entry[i].is_load
+	&& !pass3_insn_entry[i].is_store)
+      {
+	insn = pass3_insn_entry[i].insn;
+	if (const_load_sequence_p (pass3_insn_entry, insn))
+	  replace_swapped_load_constant (pass3_insn_entry, insn);
+      }
+
+  /* Clean up.  */
+  free (pass3_insn_entry);
   return 0;
 }