diff gcc/lower-subreg.c @ 111:04ced10e8804

gcc 7
author kono
date Fri, 27 Oct 2017 22:46:09 +0900
parents f6334be47118
children 84e7813d76e9
line wrap: on
line diff
--- a/gcc/lower-subreg.c	Sun Aug 21 07:07:55 2011 +0900
+++ b/gcc/lower-subreg.c	Fri Oct 27 22:46:09 2017 +0900
@@ -1,5 +1,5 @@
 /* Decompose multiword subregs.
-   Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
+   Copyright (C) 2007-2017 Free Software Foundation, Inc.
    Contributed by Richard Henderson <rth@redhat.com>
 		  Ian Lance Taylor <iant@google.com>
 
@@ -22,39 +22,57 @@
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
-#include "machmode.h"
-#include "tm.h"
+#include "backend.h"
 #include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "df.h"
+#include "memmodel.h"
 #include "tm_p.h"
-#include "timevar.h"
-#include "flags.h"
+#include "expmed.h"
 #include "insn-config.h"
-#include "obstack.h"
-#include "basic-block.h"
+#include "emit-rtl.h"
 #include "recog.h"
-#include "bitmap.h"
+#include "cfgrtl.h"
+#include "cfgbuild.h"
 #include "dce.h"
 #include "expr.h"
-#include "except.h"
-#include "regs.h"
 #include "tree-pass.h"
-#include "df.h"
+#include "lower-subreg.h"
+#include "rtl-iter.h"
+#include "target.h"
 
-#ifdef STACK_GROWS_DOWNWARD
-# undef STACK_GROWS_DOWNWARD
-# define STACK_GROWS_DOWNWARD 1
-#else
-# define STACK_GROWS_DOWNWARD 0
-#endif
-
-DEF_VEC_P (bitmap);
-DEF_VEC_ALLOC_P (bitmap,heap);
 
 /* Decompose multi-word pseudo-registers into individual
-   pseudo-registers when possible.  This is possible when all the uses
-   of a multi-word register are via SUBREG, or are copies of the
-   register to another location.  Breaking apart the register permits
-   more CSE and permits better register allocation.  */
+   pseudo-registers when possible and profitable.  This is possible
+   when all the uses of a multi-word register are via SUBREG, or are
+   copies of the register to another location.  Breaking apart the
+   register permits more CSE and permits better register allocation.
+   This is profitable if the machine does not have move instructions
+   to do this.
+
+   This pass only splits moves with modes that are wider than
+   word_mode and ASHIFTs, LSHIFTRTs, ASHIFTRTs and ZERO_EXTENDs with
+   integer modes that are twice the width of word_mode.  The latter
+   could be generalized if there was a need to do this, but the trend in
+   architectures is to not need this.
+
+   There are two useful preprocessor defines for use by maintainers:
+
+   #define LOG_COSTS 1
+
+   if you wish to see the actual cost estimates that are being used
+   for each mode wider than word mode and the cost estimates for zero
+   extension and the shifts.   This can be useful when port maintainers
+   are tuning insn rtx costs.
+
+   #define FORCE_LOWERING 1
+
+   if you wish to test the pass with all the transformation forced on.
+   This can be useful for finding bugs in the transformations.  */
+
+#define LOG_COSTS 0
+#define FORCE_LOWERING 0
 
 /* Bit N in this bitmap is set if regno N is used in a context in
    which we can decompose it.  */
@@ -72,10 +90,199 @@
 
 /* Bit N in the bitmap in element M of this array is set if there is a
    copy from reg M to reg N.  */
-static VEC(bitmap,heap) *reg_copy_graph;
+static vec<bitmap> reg_copy_graph;
+
+struct target_lower_subreg default_target_lower_subreg;
+#if SWITCHABLE_TARGET
+struct target_lower_subreg *this_target_lower_subreg
+  = &default_target_lower_subreg;
+#endif
+
+#define twice_word_mode \
+  this_target_lower_subreg->x_twice_word_mode
+#define choices \
+  this_target_lower_subreg->x_choices
+
+/* RTXes used while computing costs.  */
+struct cost_rtxes {
+  /* Source and target registers.  */
+  rtx source;
+  rtx target;
+
+  /* A twice_word_mode ZERO_EXTEND of SOURCE.  */
+  rtx zext;
+
+  /* A shift of SOURCE.  */
+  rtx shift;
+
+  /* A SET of TARGET.  */
+  rtx set;
+};
+
+/* Return the cost of a CODE shift in mode MODE by OP1 bits, using the
+   rtxes in RTXES.  SPEED_P selects between the speed and size cost.  */
+
+static int
+shift_cost (bool speed_p, struct cost_rtxes *rtxes, enum rtx_code code,
+	    machine_mode mode, int op1)
+{
+  PUT_CODE (rtxes->shift, code);
+  PUT_MODE (rtxes->shift, mode);
+  PUT_MODE (rtxes->source, mode);
+  XEXP (rtxes->shift, 1) = GEN_INT (op1);
+  return set_src_cost (rtxes->shift, mode, speed_p);
+}
+
+/* For each X in the range [0, BITS_PER_WORD), set SPLITTING[X]
+   to true if it is profitable to split a double-word CODE shift
+   of X + BITS_PER_WORD bits.  SPEED_P says whether we are testing
+   for speed or size profitability.
+
+   Use the rtxes in RTXES to calculate costs.  WORD_MOVE_ZERO_COST is
+   the cost of moving zero into a word-mode register.  WORD_MOVE_COST
+   is the cost of moving between word registers.  */
+
+static void
+compute_splitting_shift (bool speed_p, struct cost_rtxes *rtxes,
+			 bool *splitting, enum rtx_code code,
+			 int word_move_zero_cost, int word_move_cost)
+{
+  int wide_cost, narrow_cost, upper_cost, i;
+
+  for (i = 0; i < BITS_PER_WORD; i++)
+    {
+      wide_cost = shift_cost (speed_p, rtxes, code, twice_word_mode,
+			      i + BITS_PER_WORD);
+      if (i == 0)
+	narrow_cost = word_move_cost;
+      else
+	narrow_cost = shift_cost (speed_p, rtxes, code, word_mode, i);
+
+      if (code != ASHIFTRT)
+	upper_cost = word_move_zero_cost;
+      else if (i == BITS_PER_WORD - 1)
+	upper_cost = word_move_cost;
+      else
+	upper_cost = shift_cost (speed_p, rtxes, code, word_mode,
+				 BITS_PER_WORD - 1);
+
+      if (LOG_COSTS)
+	fprintf (stderr, "%s %s by %d: original cost %d, split cost %d + %d\n",
+		 GET_MODE_NAME (twice_word_mode), GET_RTX_NAME (code),
+		 i + BITS_PER_WORD, wide_cost, narrow_cost, upper_cost);
+
+      if (FORCE_LOWERING || wide_cost >= narrow_cost + upper_cost)
+	splitting[i] = true;
+    }
+}
+
+/* Compute what we should do when optimizing for speed or size; SPEED_P
+   selects which.  Use RTXES for computing costs.  */
+
+static void
+compute_costs (bool speed_p, struct cost_rtxes *rtxes)
+{
+  unsigned int i;
+  int word_move_zero_cost, word_move_cost;
 
-/* Return whether X is a simple object which we can take a word_mode
-   subreg of.  */
+  PUT_MODE (rtxes->target, word_mode);
+  SET_SRC (rtxes->set) = CONST0_RTX (word_mode);
+  word_move_zero_cost = set_rtx_cost (rtxes->set, speed_p);
+
+  SET_SRC (rtxes->set) = rtxes->source;
+  word_move_cost = set_rtx_cost (rtxes->set, speed_p);
+
+  if (LOG_COSTS)
+    fprintf (stderr, "%s move: from zero cost %d, from reg cost %d\n",
+	     GET_MODE_NAME (word_mode), word_move_zero_cost, word_move_cost);
+
+  for (i = 0; i < MAX_MACHINE_MODE; i++)
+    {
+      machine_mode mode = (machine_mode) i;
+      int factor = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
+      if (factor > 1)
+	{
+	  int mode_move_cost;
+
+	  PUT_MODE (rtxes->target, mode);
+	  PUT_MODE (rtxes->source, mode);
+	  mode_move_cost = set_rtx_cost (rtxes->set, speed_p);
+
+	  if (LOG_COSTS)
+	    fprintf (stderr, "%s move: original cost %d, split cost %d * %d\n",
+		     GET_MODE_NAME (mode), mode_move_cost,
+		     word_move_cost, factor);
+
+	  if (FORCE_LOWERING || mode_move_cost >= word_move_cost * factor)
+	    {
+	      choices[speed_p].move_modes_to_split[i] = true;
+	      choices[speed_p].something_to_do = true;
+	    }
+	}
+    }
+
+  /* For the moves and shifts, the only case that is checked is one
+     where the mode of the target is an integer mode twice the width
+     of the word_mode.
+
+     If it is not profitable to split a double word move then do not
+     even consider the shifts or the zero extension.  */
+  if (choices[speed_p].move_modes_to_split[(int) twice_word_mode])
+    {
+      int zext_cost;
+
+      /* The only case to check here is whether moving the upper part with
+	 a zero is cheaper than doing the zext itself.  */
+      PUT_MODE (rtxes->source, word_mode);
+      zext_cost = set_src_cost (rtxes->zext, twice_word_mode, speed_p);
+
+      if (LOG_COSTS)
+	fprintf (stderr, "%s %s: original cost %d, split cost %d + %d\n",
+		 GET_MODE_NAME (twice_word_mode), GET_RTX_NAME (ZERO_EXTEND),
+		 zext_cost, word_move_cost, word_move_zero_cost);
+
+      if (FORCE_LOWERING || zext_cost >= word_move_cost + word_move_zero_cost)
+	choices[speed_p].splitting_zext = true;
+
+      compute_splitting_shift (speed_p, rtxes,
+			       choices[speed_p].splitting_ashift, ASHIFT,
+			       word_move_zero_cost, word_move_cost);
+      compute_splitting_shift (speed_p, rtxes,
+			       choices[speed_p].splitting_lshiftrt, LSHIFTRT,
+			       word_move_zero_cost, word_move_cost);
+      compute_splitting_shift (speed_p, rtxes,
+			       choices[speed_p].splitting_ashiftrt, ASHIFTRT,
+			       word_move_zero_cost, word_move_cost);
+    }
+}
+
+/* Do once-per-target initialisation.  This involves determining
+   which operations on the machine are profitable.  If none are found,
+   then the pass just returns when called.  */
+
+void
+init_lower_subreg (void)
+{
+  struct cost_rtxes rtxes;
+
+  memset (this_target_lower_subreg, 0, sizeof (*this_target_lower_subreg));
+
+  twice_word_mode = GET_MODE_2XWIDER_MODE (word_mode).require ();
+
+  rtxes.target = gen_rtx_REG (word_mode, LAST_VIRTUAL_REGISTER + 1);
+  rtxes.source = gen_rtx_REG (word_mode, LAST_VIRTUAL_REGISTER + 2);
+  rtxes.set = gen_rtx_SET (rtxes.target, rtxes.source);
+  rtxes.zext = gen_rtx_ZERO_EXTEND (twice_word_mode, rtxes.source);
+  rtxes.shift = gen_rtx_ASHIFT (twice_word_mode, rtxes.source, const0_rtx);
+
+  if (LOG_COSTS)
+    fprintf (stderr, "\nSize costs\n==========\n\n");
+  compute_costs (false, &rtxes);
+
+  if (LOG_COSTS)
+    fprintf (stderr, "\nSpeed costs\n===========\n\n");
+  compute_costs (true, &rtxes);
+}
 
 static bool
 simple_move_operand (rtx x)
@@ -94,22 +301,25 @@
 
   if (MEM_P (x)
       && (MEM_VOLATILE_P (x)
-	  || mode_dependent_address_p (XEXP (x, 0))))
+	  || mode_dependent_address_p (XEXP (x, 0), MEM_ADDR_SPACE (x))))
     return false;
 
   return true;
 }
 
-/* If INSN is a single set between two objects, return the single set.
-   Such an insn can always be decomposed.  INSN should have been
-   passed to recog and extract_insn before this is called.  */
+/* If INSN is a single set between two objects that we want to split,
+   return the single set.  SPEED_P says whether we are optimizing
+   INSN for speed or size.
+
+   INSN should have been passed to recog and extract_insn before this
+   is called.  */
 
 static rtx
-simple_move (rtx insn)
+simple_move (rtx_insn *insn, bool speed_p)
 {
   rtx x;
   rtx set;
-  enum machine_mode mode;
+  machine_mode mode;
 
   if (recog_data.n_operands != 2)
     return NULL_RTX;
@@ -138,10 +348,9 @@
      registers.  That means that we can't decompose if this is a
      non-integer mode for which there is no integer mode of the same
      size.  */
-  mode = GET_MODE (SET_SRC (set));
+  mode = GET_MODE (SET_DEST (set));
   if (!SCALAR_INT_MODE_P (mode)
-      && (mode_for_size (GET_MODE_SIZE (mode) * BITS_PER_UNIT, MODE_INT, 0)
-	  == BLKmode))
+      && !int_mode_for_size (GET_MODE_BITSIZE (mode), 0).exists ())
     return NULL_RTX;
 
   /* Reject PARTIAL_INT modes.  They are used for processor specific
@@ -149,6 +358,9 @@
   if (GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
     return NULL_RTX;
 
+  if (!choices[speed_p].move_modes_to_split[(int) mode])
+    return NULL_RTX;
+
   return set;
 }
 
@@ -172,14 +384,11 @@
   if (HARD_REGISTER_NUM_P (rd) || HARD_REGISTER_NUM_P (rs))
     return false;
 
-  if (GET_MODE_SIZE (GET_MODE (dest)) <= UNITS_PER_WORD)
-    return false;
-
-  b = VEC_index (bitmap, reg_copy_graph, rs);
+  b = reg_copy_graph[rs];
   if (b == NULL)
     {
       b = BITMAP_ALLOC (NULL);
-      VEC_replace (bitmap, reg_copy_graph, rs, b);
+      reg_copy_graph[rs] = b;
     }
 
   bitmap_set_bit (b, rd);
@@ -196,10 +405,7 @@
 static void
 propagate_pseudo_copies (void)
 {
-  bitmap queue, propagate;
-
-  queue = BITMAP_ALLOC (NULL);
-  propagate = BITMAP_ALLOC (NULL);
+  auto_bitmap queue, propagate;
 
   bitmap_copy (queue, decomposable_context);
   do
@@ -211,7 +417,7 @@
 
       EXECUTE_IF_SET_IN_BITMAP (queue, 0, i, iter)
 	{
-	  bitmap b = VEC_index (bitmap, reg_copy_graph, i);
+	  bitmap b = reg_copy_graph[i];
 	  if (b)
 	    bitmap_ior_and_compl_into (propagate, b, non_decomposable_context);
 	}
@@ -220,138 +426,136 @@
       bitmap_ior_into (decomposable_context, propagate);
     }
   while (!bitmap_empty_p (queue));
-
-  BITMAP_FREE (queue);
-  BITMAP_FREE (propagate);
 }
 
 /* A pointer to one of these values is passed to
-   find_decomposable_subregs via for_each_rtx.  */
+   find_decomposable_subregs.  */
 
 enum classify_move_insn
 {
   /* Not a simple move from one location to another.  */
   NOT_SIMPLE_MOVE,
-  /* A simple move from one pseudo-register to another.  */
-  SIMPLE_PSEUDO_REG_MOVE,
-  /* A simple move involving a non-pseudo-register.  */
+  /* A simple move we want to decompose.  */
+  DECOMPOSABLE_SIMPLE_MOVE,
+  /* Any other simple move.  */
   SIMPLE_MOVE
 };
 
-/* This is called via for_each_rtx.  If we find a SUBREG which we
-   could use to decompose a pseudo-register, set a bit in
-   DECOMPOSABLE_CONTEXT.  If we find an unadorned register which is
-   not a simple pseudo-register copy, DATA will point at the type of
-   move, and we set a bit in DECOMPOSABLE_CONTEXT or
-   NON_DECOMPOSABLE_CONTEXT as appropriate.  */
+/* If we find a SUBREG in *LOC which we could use to decompose a
+   pseudo-register, set a bit in DECOMPOSABLE_CONTEXT.  If we find an
+   unadorned register which is not a simple pseudo-register copy,
+   PCMI will point at the type of move, and we set a bit in
+   DECOMPOSABLE_CONTEXT or NON_DECOMPOSABLE_CONTEXT as appropriate.  */
 
-static int
-find_decomposable_subregs (rtx *px, void *data)
+static void
+find_decomposable_subregs (rtx *loc, enum classify_move_insn *pcmi)
 {
-  enum classify_move_insn *pcmi = (enum classify_move_insn *) data;
-  rtx x = *px;
+  subrtx_var_iterator::array_type array;
+  FOR_EACH_SUBRTX_VAR (iter, array, *loc, NONCONST)
+    {
+      rtx x = *iter;
+      if (GET_CODE (x) == SUBREG)
+	{
+	  rtx inner = SUBREG_REG (x);
+	  unsigned int regno, outer_size, inner_size, outer_words, inner_words;
+
+	  if (!REG_P (inner))
+	    continue;
 
-  if (x == NULL_RTX)
-    return 0;
+	  regno = REGNO (inner);
+	  if (HARD_REGISTER_NUM_P (regno))
+	    {
+	      iter.skip_subrtxes ();
+	      continue;
+	    }
+
+	  outer_size = GET_MODE_SIZE (GET_MODE (x));
+	  inner_size = GET_MODE_SIZE (GET_MODE (inner));
+	  outer_words = (outer_size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
+	  inner_words = (inner_size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
 
-  if (GET_CODE (x) == SUBREG)
-    {
-      rtx inner = SUBREG_REG (x);
-      unsigned int regno, outer_size, inner_size, outer_words, inner_words;
+	  /* We only try to decompose single word subregs of multi-word
+	     registers.  When we find one, we skip its subrtxes to avoid
+	     iterating over the inner register.
 
-      if (!REG_P (inner))
-	return 0;
+	     ??? This doesn't allow, e.g., DImode subregs of TImode values
+	     on 32-bit targets.  We would need to record the way the
+	     pseudo-register was used, and only decompose if all the uses
+	     were the same number and size of pieces.  Hopefully this
+	     doesn't happen much.  */
 
-      regno = REGNO (inner);
-      if (HARD_REGISTER_NUM_P (regno))
-	return -1;
+	  if (outer_words == 1 && inner_words > 1)
+	    {
+	      bitmap_set_bit (decomposable_context, regno);
+	      iter.skip_subrtxes ();
+	      continue;
+	    }
 
-      outer_size = GET_MODE_SIZE (GET_MODE (x));
-      inner_size = GET_MODE_SIZE (GET_MODE (inner));
-      outer_words = (outer_size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
-      inner_words = (inner_size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
+	  /* If this is a cast from one mode to another, where the modes
+	     have the same size, and they are not tieable, then mark this
+	     register as non-decomposable.  If we decompose it we are
+	     likely to mess up whatever the backend is trying to do.  */
+	  if (outer_words > 1
+	      && outer_size == inner_size
+	      && !targetm.modes_tieable_p (GET_MODE (x), GET_MODE (inner)))
+	    {
+	      bitmap_set_bit (non_decomposable_context, regno);
+	      bitmap_set_bit (subreg_context, regno);
+	      iter.skip_subrtxes ();
+	      continue;
+	    }
+	}
+      else if (REG_P (x))
+	{
+	  unsigned int regno;
 
-      /* We only try to decompose single word subregs of multi-word
-	 registers.  When we find one, we return -1 to avoid iterating
-	 over the inner register.
+	  /* We will see an outer SUBREG before we see the inner REG, so
+	     when we see a plain REG here it means a direct reference to
+	     the register.
 
-	 ??? This doesn't allow, e.g., DImode subregs of TImode values
-	 on 32-bit targets.  We would need to record the way the
-	 pseudo-register was used, and only decompose if all the uses
-	 were the same number and size of pieces.  Hopefully this
-	 doesn't happen much.  */
+	     If this is not a simple copy from one location to another,
+	     then we can not decompose this register.  If this is a simple
+	     copy we want to decompose, and the mode is right,
+	     then we mark the register as decomposable.
+	     Otherwise we don't say anything about this register --
+	     it could be decomposed, but whether that would be
+	     profitable depends upon how it is used elsewhere.
 
-      if (outer_words == 1 && inner_words > 1)
-	{
-	  bitmap_set_bit (decomposable_context, regno);
-	  return -1;
-	}
+	     We only set bits in the bitmap for multi-word
+	     pseudo-registers, since those are the only ones we care about
+	     and it keeps the size of the bitmaps down.  */
 
-      /* If this is a cast from one mode to another, where the modes
-	 have the same size, and they are not tieable, then mark this
-	 register as non-decomposable.  If we decompose it we are
-	 likely to mess up whatever the backend is trying to do.  */
-      if (outer_words > 1
-	  && outer_size == inner_size
-	  && !MODES_TIEABLE_P (GET_MODE (x), GET_MODE (inner)))
+	  regno = REGNO (x);
+	  if (!HARD_REGISTER_NUM_P (regno)
+	      && GET_MODE_SIZE (GET_MODE (x)) > UNITS_PER_WORD)
+	    {
+	      switch (*pcmi)
+		{
+		case NOT_SIMPLE_MOVE:
+		  bitmap_set_bit (non_decomposable_context, regno);
+		  break;
+		case DECOMPOSABLE_SIMPLE_MOVE:
+		  if (targetm.modes_tieable_p (GET_MODE (x), word_mode))
+		    bitmap_set_bit (decomposable_context, regno);
+		  break;
+		case SIMPLE_MOVE:
+		  break;
+		default:
+		  gcc_unreachable ();
+		}
+	    }
+	}
+      else if (MEM_P (x))
 	{
-	  bitmap_set_bit (non_decomposable_context, regno);
-	  bitmap_set_bit (subreg_context, regno);
-	  return -1;
+	  enum classify_move_insn cmi_mem = NOT_SIMPLE_MOVE;
+
+	  /* Any registers used in a MEM do not participate in a
+	     SIMPLE_MOVE or DECOMPOSABLE_SIMPLE_MOVE.  Do our own recursion
+	     here, and skip the subrtxes to block the parent's iteration.  */
+	  find_decomposable_subregs (&XEXP (x, 0), &cmi_mem);
+	  iter.skip_subrtxes ();
 	}
     }
-  else if (REG_P (x))
-    {
-      unsigned int regno;
-
-      /* We will see an outer SUBREG before we see the inner REG, so
-	 when we see a plain REG here it means a direct reference to
-	 the register.
-
-	 If this is not a simple copy from one location to another,
-	 then we can not decompose this register.  If this is a simple
-	 copy from one pseudo-register to another, and the mode is right
-	 then we mark the register as decomposable.
-	 Otherwise we don't say anything about this register --
-	 it could be decomposed, but whether that would be
-	 profitable depends upon how it is used elsewhere.
-
-	 We only set bits in the bitmap for multi-word
-	 pseudo-registers, since those are the only ones we care about
-	 and it keeps the size of the bitmaps down.  */
-
-      regno = REGNO (x);
-      if (!HARD_REGISTER_NUM_P (regno)
-	  && GET_MODE_SIZE (GET_MODE (x)) > UNITS_PER_WORD)
-	{
-	  switch (*pcmi)
-	    {
-	    case NOT_SIMPLE_MOVE:
-	      bitmap_set_bit (non_decomposable_context, regno);
-	      break;
-	    case SIMPLE_PSEUDO_REG_MOVE:
-	      if (MODES_TIEABLE_P (GET_MODE (x), word_mode))
-		bitmap_set_bit (decomposable_context, regno);
-	      break;
-	    case SIMPLE_MOVE:
-	      break;
-	    default:
-	      gcc_unreachable ();
-	    }
-	}
-    }
-  else if (MEM_P (x))
-    {
-      enum classify_move_insn cmi_mem = NOT_SIMPLE_MOVE;
-
-      /* Any registers used in a MEM do not participate in a
-	 SIMPLE_MOVE or SIMPLE_PSEUDO_REG_MOVE.  Do our own recursion
-	 here, and return -1 to block the parent's recursion.  */
-      for_each_rtx (&XEXP (x, 0), find_decomposable_subregs, &cmi_mem);
-      return -1;
-    }
-
-  return 0;
 }
 
 /* Decompose REGNO into word-sized components.  We smash the REG node
@@ -392,11 +596,11 @@
 /* Get a SUBREG of a CONCATN.  */
 
 static rtx
-simplify_subreg_concatn (enum machine_mode outermode, rtx op,
+simplify_subreg_concatn (machine_mode outermode, rtx op,
 			 unsigned int byte)
 {
   unsigned int inner_size;
-  enum machine_mode innermode, partmode;
+  machine_mode innermode, partmode;
   rtx part;
   unsigned int final_offset;
 
@@ -405,26 +609,25 @@
 
   innermode = GET_MODE (op);
   gcc_assert (byte < GET_MODE_SIZE (innermode));
-  gcc_assert (GET_MODE_SIZE (outermode) <= GET_MODE_SIZE (innermode));
+  if (GET_MODE_SIZE (outermode) > GET_MODE_SIZE (innermode))
+    return NULL_RTX;
 
   inner_size = GET_MODE_SIZE (innermode) / XVECLEN (op, 0);
   part = XVECEXP (op, 0, byte / inner_size);
   partmode = GET_MODE (part);
 
+  final_offset = byte % inner_size;
+  if (final_offset + GET_MODE_SIZE (outermode) > inner_size)
+    return NULL_RTX;
+
   /* VECTOR_CSTs in debug expressions are expanded into CONCATN instead of
      regular CONST_VECTORs.  They have vector or integer modes, depending
      on the capabilities of the target.  Cope with them.  */
   if (partmode == VOIDmode && VECTOR_MODE_P (innermode))
     partmode = GET_MODE_INNER (innermode);
   else if (partmode == VOIDmode)
-    {
-      enum mode_class mclass = GET_MODE_CLASS (innermode);
-      partmode = mode_for_size (inner_size * BITS_PER_UNIT, mclass, 0);
-    }
-
-  final_offset = byte % inner_size;
-  if (final_offset + GET_MODE_SIZE (outermode) > inner_size)
-    return NULL_RTX;
+    partmode = mode_for_size (inner_size * BITS_PER_UNIT,
+			      GET_MODE_CLASS (innermode), 0).require ();
 
   return simplify_gen_subreg (outermode, part, partmode, final_offset);
 }
@@ -432,8 +635,8 @@
 /* Wrapper around simplify_gen_subreg which handles CONCATN.  */
 
 static rtx
-simplify_gen_subreg_concatn (enum machine_mode outermode, rtx op,
-			     enum machine_mode innermode, unsigned int byte)
+simplify_gen_subreg_concatn (machine_mode outermode, rtx op,
+			     machine_mode innermode, unsigned int byte)
 {
   rtx ret;
 
@@ -456,10 +659,8 @@
       if (op2 == NULL_RTX)
 	{
 	  /* We don't handle paradoxical subregs here.  */
-	  gcc_assert (GET_MODE_SIZE (outermode)
-		      <= GET_MODE_SIZE (GET_MODE (op)));
-	  gcc_assert (GET_MODE_SIZE (GET_MODE (op))
-		      <= GET_MODE_SIZE (GET_MODE (SUBREG_REG (op))));
+	  gcc_assert (!paradoxical_subreg_p (outermode, GET_MODE (op)));
+	  gcc_assert (!paradoxical_subreg_p (op));
 	  op2 = simplify_subreg_concatn (outermode, SUBREG_REG (op),
 					 byte + SUBREG_BYTE (op));
 	  gcc_assert (op2 != NULL_RTX);
@@ -480,10 +681,7 @@
      resolve_simple_move will ask for the high part of the paradoxical
      subreg, which does not have a value.  Just return a zero.  */
   if (ret == NULL_RTX
-      && GET_CODE (op) == SUBREG
-      && SUBREG_BYTE (op) == 0
-      && (GET_MODE_SIZE (innermode)
-	  > GET_MODE_SIZE (GET_MODE (SUBREG_REG (op)))))
+      && paradoxical_subreg_p (op))
     return CONST0_RTX (outermode);
 
   gcc_assert (ret != NULL_RTX);
@@ -510,81 +708,49 @@
   return resolve_reg_p (SUBREG_REG (x));
 }
 
-/* This is called via for_each_rtx.  Look for SUBREGs which need to be
-   decomposed.  */
+/* Look for SUBREGs in *LOC which need to be decomposed.  */
 
-static int
-resolve_subreg_use (rtx *px, void *data)
+static bool
+resolve_subreg_use (rtx *loc, rtx insn)
 {
-  rtx insn = (rtx) data;
-  rtx x = *px;
-
-  if (x == NULL_RTX)
-    return 0;
-
-  if (resolve_subreg_p (x))
+  subrtx_ptr_iterator::array_type array;
+  FOR_EACH_SUBRTX_PTR (iter, array, loc, NONCONST)
     {
-      x = simplify_subreg_concatn (GET_MODE (x), SUBREG_REG (x),
-				   SUBREG_BYTE (x));
+      rtx *loc = *iter;
+      rtx x = *loc;
+      if (resolve_subreg_p (x))
+	{
+	  x = simplify_subreg_concatn (GET_MODE (x), SUBREG_REG (x),
+				       SUBREG_BYTE (x));
 
-      /* It is possible for a note to contain a reference which we can
-	 decompose.  In this case, return 1 to the caller to indicate
-	 that the note must be removed.  */
-      if (!x)
-	{
-	  gcc_assert (!insn);
-	  return 1;
+	  /* It is possible for a note to contain a reference which we can
+	     decompose.  In this case, return true to the caller to indicate
+	     that the note must be removed.  */
+	  if (!x)
+	    {
+	      gcc_assert (!insn);
+	      return true;
+	    }
+
+	  validate_change (insn, loc, x, 1);
+	  iter.skip_subrtxes ();
 	}
-
-      validate_change (insn, px, x, 1);
-      return -1;
+      else if (resolve_reg_p (x))
+	/* Return true to the caller to indicate that we found a direct
+	   reference to a register which is being decomposed.  This can
+	   happen inside notes, multiword shift or zero-extend
+	   instructions.  */
+	return true;
     }
 
-  if (resolve_reg_p (x))
-    {
-      /* Return 1 to the caller to indicate that we found a direct
-	 reference to a register which is being decomposed.  This can
-	 happen inside notes, multiword shift or zero-extend
-	 instructions.  */
-      return 1;
-    }
-
-  return 0;
-}
-
-/* This is called via for_each_rtx.  Look for SUBREGs which can be
-   decomposed and decomposed REGs that need copying.  */
-
-static int
-adjust_decomposed_uses (rtx *px, void *data ATTRIBUTE_UNUSED)
-{
-  rtx x = *px;
-
-  if (x == NULL_RTX)
-    return 0;
-
-  if (resolve_subreg_p (x))
-    {
-      x = simplify_subreg_concatn (GET_MODE (x), SUBREG_REG (x),
-				   SUBREG_BYTE (x));
-
-      if (x)
-	*px = x;
-      else
-	x = copy_rtx (*px);
-    }
-
-  if (resolve_reg_p (x))
-    *px = copy_rtx (x);
-
-  return 0;
+  return false;
 }
 
 /* Resolve any decomposed registers which appear in register notes on
    INSN.  */
 
 static void
-resolve_reg_notes (rtx insn)
+resolve_reg_notes (rtx_insn *insn)
 {
   rtx *pnote, note;
 
@@ -592,7 +758,7 @@
   if (note)
     {
       int old_count = num_validated_changes ();
-      if (for_each_rtx (&XEXP (note, 0), resolve_subreg_use, NULL))
+      if (resolve_subreg_use (&XEXP (note, 0), NULL_RTX))
 	remove_note (insn, note);
       else
 	if (old_count != num_validated_changes ())
@@ -634,8 +800,15 @@
       unsigned int regno = REGNO (x);
 
       if (HARD_REGISTER_NUM_P (regno))
-	return (validate_subreg (word_mode, GET_MODE (x), x, UNITS_PER_WORD)
-		&& HARD_REGNO_MODE_OK (regno, word_mode));
+	{
+	  unsigned int byte, num_bytes;
+
+	  num_bytes = GET_MODE_SIZE (GET_MODE (x));
+	  for (byte = 0; byte < num_bytes; byte += UNITS_PER_WORD)
+	    if (simplify_subreg_regno (regno, GET_MODE (x), byte, word_mode) < 0)
+	      return false;
+	  return true;
+	}
       else
 	return !bitmap_bit_p (subreg_context, regno);
     }
@@ -647,11 +820,12 @@
    we don't change anything, return INSN, otherwise return the start
    of the sequence of moves.  */
 
-static rtx
-resolve_simple_move (rtx set, rtx insn)
+static rtx_insn *
+resolve_simple_move (rtx set, rtx_insn *insn)
 {
-  rtx src, dest, real_dest, insns;
-  enum machine_mode orig_mode, dest_mode;
+  rtx src, dest, real_dest;
+  rtx_insn *insns;
+  machine_mode orig_mode, dest_mode;
   unsigned int words;
   bool pushing;
 
@@ -660,8 +834,7 @@
   orig_mode = GET_MODE (dest);
 
   words = (GET_MODE_SIZE (orig_mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
-  if (words <= 1)
-    return insn;
+  gcc_assert (words > 1);
 
   start_sequence ();
 
@@ -693,7 +866,8 @@
 	  || (GET_MODE_SIZE (orig_mode)
 	      != GET_MODE_SIZE (GET_MODE (SUBREG_REG (dest))))))
     {
-      rtx reg, minsn, smove;
+      rtx reg, smove;
+      rtx_insn *minsn;
 
       reg = gen_reg_rtx (orig_mode);
       minsn = emit_move_insn (reg, src);
@@ -728,9 +902,9 @@
       int acg;
 
       if (MEM_P (src))
-	for_each_rtx (&XEXP (src, 0), resolve_subreg_use, NULL_RTX);
+	resolve_subreg_use (&XEXP (src, 0), NULL_RTX);
       if (MEM_P (dest))
-	for_each_rtx (&XEXP (dest, 0), resolve_subreg_use, NULL_RTX);
+	resolve_subreg_use (&XEXP (dest, 0), NULL_RTX);
       acg = apply_change_group ();
       gcc_assert (acg);
     }
@@ -745,7 +919,20 @@
       rtx reg;
 
       reg = gen_reg_rtx (orig_mode);
-      emit_move_insn (reg, src);
+
+      if (AUTO_INC_DEC)
+	{
+	  rtx_insn *move = emit_move_insn (reg, src);
+	  if (MEM_P (src))
+	    {
+	      rtx note = find_reg_note (insn, REG_INC, NULL_RTX);
+	      if (note)
+		add_reg_note (move, REG_INC, XEXP (note, 0));
+	    }
+	}
+      else
+	emit_move_insn (reg, src);
+
       src = reg;
     }
 
@@ -767,11 +954,7 @@
       if (real_dest == NULL_RTX)
 	real_dest = dest;
       if (!SCALAR_INT_MODE_P (dest_mode))
-	{
-	  dest_mode = mode_for_size (GET_MODE_SIZE (dest_mode) * BITS_PER_UNIT,
-				     MODE_INT, 0);
-	  gcc_assert (dest_mode != BLKmode);
-	}
+	dest_mode = int_mode_for_mode (dest_mode).require ();
       dest = gen_reg_rtx (dest_mode);
       if (REG_P (real_dest))
 	REG_ATTRS (dest) = REG_ATTRS (real_dest);
@@ -827,7 +1010,8 @@
 
   if (real_dest != NULL_RTX)
     {
-      rtx mdest, minsn, smove;
+      rtx mdest, smove;
+      rtx_insn *minsn;
 
       if (dest_mode == orig_mode)
 	mdest = dest;
@@ -835,6 +1019,14 @@
 	mdest = simplify_gen_subreg (orig_mode, dest, GET_MODE (dest), 0);
       minsn = emit_move_insn (real_dest, mdest);
 
+  if (AUTO_INC_DEC && MEM_P (real_dest)
+      && !(resolve_reg_p (real_dest) || resolve_subreg_p (real_dest)))
+    {
+      rtx note = find_reg_note (insn, REG_INC, NULL_RTX);
+      if (note)
+	add_reg_note (minsn, REG_INC, XEXP (note, 0));
+    }
+
       smove = single_set (minsn);
       gcc_assert (smove != NULL_RTX);
 
@@ -848,7 +1040,13 @@
 
   emit_insn_before (insns, insn);
 
-  delete_insn (insn);
+  /* If we get here via self-recursion, then INSN is not yet in the insns
+     chain and delete_insn will fail.  We only want to remove INSN from the
+     current sequence.  See PR56738.  */
+  if (in_sequence_p ())
+    remove_insn (insn);
+  else
+    delete_insn (insn);
 
   return insns;
 }
@@ -857,10 +1055,10 @@
    component registers.  Return whether we changed something.  */
 
 static bool
-resolve_clobber (rtx pat, rtx insn)
+resolve_clobber (rtx pat, rtx_insn *insn)
 {
   rtx reg;
-  enum machine_mode orig_mode;
+  machine_mode orig_mode;
   unsigned int words, i;
   int ret;
 
@@ -898,7 +1096,7 @@
    whether we changed something.  */
 
 static bool
-resolve_use (rtx pat, rtx insn)
+resolve_use (rtx pat, rtx_insn *insn)
 {
   if (resolve_reg_p (XEXP (pat, 0)) || resolve_subreg_p (XEXP (pat, 0)))
     {
@@ -914,21 +1112,39 @@
 /* A VAR_LOCATION can be simplified.  */
 
 static void
-resolve_debug (rtx insn)
+resolve_debug (rtx_insn *insn)
 {
-  for_each_rtx (&PATTERN (insn), adjust_decomposed_uses, NULL_RTX);
+  subrtx_ptr_iterator::array_type array;
+  FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), NONCONST)
+    {
+      rtx *loc = *iter;
+      rtx x = *loc;
+      if (resolve_subreg_p (x))
+	{
+	  x = simplify_subreg_concatn (GET_MODE (x), SUBREG_REG (x),
+				       SUBREG_BYTE (x));
+
+	  if (x)
+	    *loc = x;
+	  else
+	    x = copy_rtx (*loc);
+	}
+      if (resolve_reg_p (x))
+	*loc = copy_rtx (x);
+    }
 
   df_insn_rescan (insn);
 
   resolve_reg_notes (insn);
 }
 
-/* Checks if INSN is a decomposable multiword-shift or zero-extend and
-   sets the decomposable_context bitmap accordingly.  A non-zero value
-   is returned if a decomposable insn has been found.  */
+/* Check if INSN is a decomposable multiword-shift or zero-extend and
+   set the decomposable_context bitmap accordingly.  SPEED_P is true
+   if we are optimizing INSN for speed rather than size.  Return true
+   if INSN is decomposable.  */
 
-static int
-find_decomposable_shift_zext (rtx insn)
+static bool
+find_decomposable_shift_zext (rtx_insn *insn, bool speed_p)
 {
   rtx set;
   rtx op;
@@ -936,41 +1152,47 @@
 
   set = single_set (insn);
   if (!set)
-    return 0;
+    return false;
 
   op = SET_SRC (set);
   if (GET_CODE (op) != ASHIFT
       && GET_CODE (op) != LSHIFTRT
+      && GET_CODE (op) != ASHIFTRT
       && GET_CODE (op) != ZERO_EXTEND)
-    return 0;
+    return false;
 
   op_operand = XEXP (op, 0);
   if (!REG_P (SET_DEST (set)) || !REG_P (op_operand)
       || HARD_REGISTER_NUM_P (REGNO (SET_DEST (set)))
       || HARD_REGISTER_NUM_P (REGNO (op_operand))
-      || !SCALAR_INT_MODE_P (GET_MODE (op)))
-    return 0;
+      || GET_MODE (op) != twice_word_mode)
+    return false;
 
   if (GET_CODE (op) == ZERO_EXTEND)
     {
       if (GET_MODE (op_operand) != word_mode
-	  || GET_MODE_BITSIZE (GET_MODE (op)) != 2 * BITS_PER_WORD)
-	return 0;
+	  || !choices[speed_p].splitting_zext)
+	return false;
     }
   else /* left or right shift */
     {
+      bool *splitting = (GET_CODE (op) == ASHIFT
+			 ? choices[speed_p].splitting_ashift
+			 : GET_CODE (op) == ASHIFTRT
+			 ? choices[speed_p].splitting_ashiftrt
+			 : choices[speed_p].splitting_lshiftrt);
       if (!CONST_INT_P (XEXP (op, 1))
-	  || INTVAL (XEXP (op, 1)) < BITS_PER_WORD
-	  || GET_MODE_BITSIZE (GET_MODE (op_operand)) != 2 * BITS_PER_WORD)
-	return 0;
+	  || !IN_RANGE (INTVAL (XEXP (op, 1)), BITS_PER_WORD,
+			2 * BITS_PER_WORD - 1)
+	  || !splitting[INTVAL (XEXP (op, 1)) - BITS_PER_WORD])
+	return false;
+
+      bitmap_set_bit (decomposable_context, REGNO (op_operand));
     }
 
   bitmap_set_bit (decomposable_context, REGNO (SET_DEST (set)));
 
-  if (GET_CODE (op) != ZERO_EXTEND)
-    bitmap_set_bit (decomposable_context, REGNO (op_operand));
-
-  return 1;
+  return true;
 }
 
 /* Decompose a more than word wide shift (in INSN) of a multiword
@@ -978,38 +1200,44 @@
    and 'set to zero' insn.  Return a pointer to the new insn when a
    replacement was done.  */
 
-static rtx
-resolve_shift_zext (rtx insn)
+static rtx_insn *
+resolve_shift_zext (rtx_insn *insn)
 {
   rtx set;
   rtx op;
   rtx op_operand;
-  rtx insns;
-  rtx src_reg, dest_reg, dest_zero;
+  rtx_insn *insns;
+  rtx src_reg, dest_reg, dest_upper, upper_src = NULL_RTX;
   int src_reg_num, dest_reg_num, offset1, offset2, src_offset;
+  scalar_int_mode inner_mode;
 
   set = single_set (insn);
   if (!set)
-    return NULL_RTX;
+    return NULL;
 
   op = SET_SRC (set);
   if (GET_CODE (op) != ASHIFT
       && GET_CODE (op) != LSHIFTRT
+      && GET_CODE (op) != ASHIFTRT
       && GET_CODE (op) != ZERO_EXTEND)
-    return NULL_RTX;
+    return NULL;
 
   op_operand = XEXP (op, 0);
+  if (!is_a <scalar_int_mode> (GET_MODE (op_operand), &inner_mode))
+    return NULL;
 
+  /* We can tear this operation apart only if the regs were already
+     torn apart.  */
   if (!resolve_reg_p (SET_DEST (set)) && !resolve_reg_p (op_operand))
-    return NULL_RTX;
+    return NULL;
 
   /* src_reg_num is the number of the word mode register which we
      are operating on.  For a left shift and a zero_extend on little
      endian machines this is register 0.  */
-  src_reg_num = GET_CODE (op) == LSHIFTRT ? 1 : 0;
+  src_reg_num = (GET_CODE (op) == LSHIFTRT || GET_CODE (op) == ASHIFTRT)
+		? 1 : 0;
 
-  if (WORDS_BIG_ENDIAN
-      && GET_MODE_SIZE (GET_MODE (op_operand)) > UNITS_PER_WORD)
+  if (WORDS_BIG_ENDIAN && GET_MODE_SIZE (inner_mode) > UNITS_PER_WORD)
     src_reg_num = 1 - src_reg_num;
 
   if (GET_CODE (op) == ZERO_EXTEND)
@@ -1026,12 +1254,17 @@
   dest_reg = simplify_gen_subreg_concatn (word_mode, SET_DEST (set),
                                           GET_MODE (SET_DEST (set)),
                                           offset1);
-  dest_zero = simplify_gen_subreg_concatn (word_mode, SET_DEST (set),
-                                           GET_MODE (SET_DEST (set)),
-                                           offset2);
+  dest_upper = simplify_gen_subreg_concatn (word_mode, SET_DEST (set),
+					    GET_MODE (SET_DEST (set)),
+					    offset2);
   src_reg = simplify_gen_subreg_concatn (word_mode, op_operand,
                                          GET_MODE (op_operand),
                                          src_offset);
+  if (GET_CODE (op) == ASHIFTRT
+      && INTVAL (XEXP (op, 1)) != 2 * BITS_PER_WORD - 1)
+    upper_src = expand_shift (RSHIFT_EXPR, word_mode, copy_rtx (src_reg),
+			      BITS_PER_WORD - 1, NULL_RTX, 0);
+
   if (GET_CODE (op) != ZERO_EXTEND)
     {
       int shift_count = INTVAL (XEXP (op, 1));
@@ -1039,14 +1272,18 @@
 	src_reg = expand_shift (GET_CODE (op) == ASHIFT ?
 				LSHIFT_EXPR : RSHIFT_EXPR,
 				word_mode, src_reg,
-				build_int_cst (NULL_TREE,
-					       shift_count - BITS_PER_WORD),
-				dest_reg, 1);
+				shift_count - BITS_PER_WORD,
+				dest_reg, GET_CODE (op) != ASHIFTRT);
     }
 
   if (dest_reg != src_reg)
     emit_move_insn (dest_reg, src_reg);
-  emit_move_insn (dest_zero, CONST0_RTX (word_mode));
+  if (GET_CODE (op) != ASHIFTRT)
+    emit_move_insn (dest_upper, CONST0_RTX (word_mode));
+  else if (INTVAL (XEXP (op, 1)) == 2 * BITS_PER_WORD - 1)
+    emit_move_insn (dest_upper, copy_rtx (src_reg));
+  else
+    emit_move_insn (dest_upper, upper_src);
   insns = get_insns ();
 
   end_sequence ();
@@ -1055,7 +1292,7 @@
 
   if (dump_file)
     {
-      rtx in;
+      rtx_insn *in;
       fprintf (dump_file, "; Replacing insn: %d with insns: ", INSN_UID (insn));
       for (in = insns; in != insn; in = NEXT_INSN (in))
 	fprintf (dump_file, "%d ", INSN_UID (in));
@@ -1066,18 +1303,81 @@
   return insns;
 }
 
-/* Look for registers which are always accessed via word-sized SUBREGs
-   or via copies.  Decompose these registers into several word-sized
-   pseudo-registers.  */
+/* Print to dump_file a description of what we're doing with shift code CODE.
+   SPLITTING[X] is true if we are splitting shifts by X + BITS_PER_WORD.  */
+
+static void
+dump_shift_choices (enum rtx_code code, bool *splitting)
+{
+  int i;
+  const char *sep;
+
+  fprintf (dump_file,
+	   "  Splitting mode %s for %s lowering with shift amounts = ",
+	   GET_MODE_NAME (twice_word_mode), GET_RTX_NAME (code));
+  sep = "";
+  for (i = 0; i < BITS_PER_WORD; i++)
+    if (splitting[i])
+      {
+	fprintf (dump_file, "%s%d", sep, i + BITS_PER_WORD);
+	sep = ",";
+      }
+  fprintf (dump_file, "\n");
+}
+
+/* Print to dump_file a description of what we're doing when optimizing
+   for speed or size; SPEED_P says which.  DESCRIPTION is a description
+   of the SPEED_P choice.  */
 
 static void
-decompose_multiword_subregs (void)
+dump_choices (bool speed_p, const char *description)
+{
+  unsigned int i;
+
+  fprintf (dump_file, "Choices when optimizing for %s:\n", description);
+
+  for (i = 0; i < MAX_MACHINE_MODE; i++)
+    if (GET_MODE_SIZE ((machine_mode) i) > UNITS_PER_WORD)
+      fprintf (dump_file, "  %s mode %s for copy lowering.\n",
+	       choices[speed_p].move_modes_to_split[i]
+	       ? "Splitting"
+	       : "Skipping",
+	       GET_MODE_NAME ((machine_mode) i));
+
+  fprintf (dump_file, "  %s mode %s for zero_extend lowering.\n",
+	   choices[speed_p].splitting_zext ? "Splitting" : "Skipping",
+	   GET_MODE_NAME (twice_word_mode));
+
+  dump_shift_choices (ASHIFT, choices[speed_p].splitting_ashift);
+  dump_shift_choices (LSHIFTRT, choices[speed_p].splitting_lshiftrt);
+  dump_shift_choices (ASHIFTRT, choices[speed_p].splitting_ashiftrt);
+  fprintf (dump_file, "\n");
+}
+
+/* Look for registers which are always accessed via word-sized SUBREGs
+   or, if DECOMPOSE_COPIES is true, via copies.  Decompose these
+   registers into several word-sized pseudo-registers.  */
+
+static void
+decompose_multiword_subregs (bool decompose_copies)
 {
   unsigned int max;
   basic_block bb;
+  bool speed_p;
 
-  if (df)
-    df_set_flags (DF_DEFER_INSN_RESCAN);
+  if (dump_file)
+    {
+      dump_choices (false, "size");
+      dump_choices (true, "speed");
+    }
+
+  /* Check if this target even has any modes to consider lowering.  */
+  if (!choices[false].something_to_do && !choices[true].something_to_do)
+    {
+      if (dump_file)
+	fprintf (dump_file, "Nothing to do!\n");
+      return;
+    }
 
   max = max_reg_num ();
 
@@ -1087,36 +1387,51 @@
      all the insns.  */
   {
     unsigned int i;
+    bool useful_modes_seen = false;
 
     for (i = FIRST_PSEUDO_REGISTER; i < max; ++i)
+      if (regno_reg_rtx[i] != NULL)
+	{
+	  machine_mode mode = GET_MODE (regno_reg_rtx[i]);
+	  if (choices[false].move_modes_to_split[(int) mode]
+	      || choices[true].move_modes_to_split[(int) mode])
+	    {
+	      useful_modes_seen = true;
+	      break;
+	    }
+	}
+
+    if (!useful_modes_seen)
       {
-	if (regno_reg_rtx[i] != NULL
-	    && GET_MODE_SIZE (GET_MODE (regno_reg_rtx[i])) > UNITS_PER_WORD)
-	  break;
+	if (dump_file)
+	  fprintf (dump_file, "Nothing to lower in this function.\n");
+	return;
       }
-    if (i == max)
-      return;
   }
 
   if (df)
-    run_word_dce ();
+    {
+      df_set_flags (DF_DEFER_INSN_RESCAN);
+      run_word_dce ();
+    }
 
-  /* FIXME: When the dataflow branch is merged, we can change this
-     code to look for each multi-word pseudo-register and to find each
-     insn which sets or uses that register.  That should be faster
-     than scanning all the insns.  */
+  /* FIXME: It may be possible to change this code to look for each
+     multi-word pseudo-register and to find each insn which sets or
+     uses that register.  That should be faster than scanning all the
+     insns.  */
 
   decomposable_context = BITMAP_ALLOC (NULL);
   non_decomposable_context = BITMAP_ALLOC (NULL);
   subreg_context = BITMAP_ALLOC (NULL);
 
-  reg_copy_graph = VEC_alloc (bitmap, heap, max);
-  VEC_safe_grow (bitmap, heap, reg_copy_graph, max);
-  memset (VEC_address (bitmap, reg_copy_graph), 0, sizeof (bitmap) * max);
+  reg_copy_graph.create (max);
+  reg_copy_graph.safe_grow_cleared (max);
+  memset (reg_copy_graph.address (), 0, sizeof (bitmap) * max);
 
-  FOR_EACH_BB (bb)
+  speed_p = optimize_function_for_speed_p (cfun);
+  FOR_EACH_BB_FN (bb, cfun)
     {
-      rtx insn;
+      rtx_insn *insn;
 
       FOR_BB_INSNS (bb, insn)
 	{
@@ -1129,20 +1444,28 @@
 	      || GET_CODE (PATTERN (insn)) == USE)
 	    continue;
 
-	  if (find_decomposable_shift_zext (insn))
+	  recog_memoized (insn);
+
+	  if (find_decomposable_shift_zext (insn, speed_p))
 	    continue;
 
-	  recog_memoized (insn);
 	  extract_insn (insn);
 
-	  set = simple_move (insn);
+	  set = simple_move (insn, speed_p);
 
 	  if (!set)
 	    cmi = NOT_SIMPLE_MOVE;
 	  else
 	    {
+	      /* We mark pseudo-to-pseudo copies as decomposable during the
+		 second pass only.  The first pass is so early that there is a
+		 good chance such moves will be optimized away completely by
+		 subsequent optimizations anyway.
+
+		 However, we call find_pseudo_copy even during the first pass
+		 so as to properly set up the reg_copy_graph.  */
 	      if (find_pseudo_copy (set))
-		cmi = SIMPLE_PSEUDO_REG_MOVE;
+		cmi = decompose_copies? DECOMPOSABLE_SIMPLE_MOVE : SIMPLE_MOVE;
 	      else
 		cmi = SIMPLE_MOVE;
 	    }
@@ -1150,9 +1473,7 @@
 	  n = recog_data.n_operands;
 	  for (i = 0; i < n; ++i)
 	    {
-	      for_each_rtx (&recog_data.operand[i],
-			    find_decomposable_subregs,
-			    &cmi);
+	      find_decomposable_subregs (&recog_data.operand[i], &cmi);
 
 	      /* We handle ASM_OPERANDS as a special case to support
 		 things like x86 rdtsc which returns a DImode value.
@@ -1172,7 +1493,6 @@
   bitmap_and_compl_into (decomposable_context, non_decomposable_context);
   if (!bitmap_empty_p (decomposable_context))
     {
-      sbitmap sub_blocks;
       unsigned int i;
       sbitmap_iterator sbi;
       bitmap_iterator iter;
@@ -1180,15 +1500,15 @@
 
       propagate_pseudo_copies ();
 
-      sub_blocks = sbitmap_alloc (last_basic_block);
-      sbitmap_zero (sub_blocks);
+      auto_sbitmap sub_blocks (last_basic_block_for_fn (cfun));
+      bitmap_clear (sub_blocks);
 
       EXECUTE_IF_SET_IN_BITMAP (decomposable_context, 0, regno, iter)
 	decompose_register (regno);
 
-      FOR_EACH_BB (bb)
+      FOR_EACH_BB_FN (bb, cfun)
 	{
-	  rtx insn;
+	  rtx_insn *insn;
 
 	  FOR_BB_INSNS (bb, insn)
 	    {
@@ -1212,10 +1532,10 @@
 		  recog_memoized (insn);
 		  extract_insn (insn);
 
-		  set = simple_move (insn);
+		  set = simple_move (insn, speed_p);
 		  if (set)
 		    {
-		      rtx orig_insn = insn;
+		      rtx_insn *orig_insn = insn;
 		      bool cfi = control_flow_insn_p (insn);
 
 		      /* We can end up splitting loads to multi-word pseudos
@@ -1240,12 +1560,12 @@
 			  extract_insn (insn);
 
 			  if (cfi)
-			    SET_BIT (sub_blocks, bb->index);
+			    bitmap_set_bit (sub_blocks, bb->index);
 			}
 		    }
 		  else
 		    {
-		      rtx decomposed_shift;
+		      rtx_insn *decomposed_shift;
 
 		      decomposed_shift = resolve_shift_zext (insn);
 		      if (decomposed_shift != NULL_RTX)
@@ -1257,9 +1577,7 @@
 		    }
 
 		  for (i = recog_data.n_operands - 1; i >= 0; --i)
-		    for_each_rtx (recog_data.operand_loc[i],
-				  resolve_subreg_use,
-				  insn);
+		    resolve_subreg_use (recog_data.operand_loc[i], insn);
 
 		  resolve_reg_notes (insn);
 
@@ -1285,12 +1603,12 @@
 	 of a basic block, split those blocks now.  Note that we only handle
 	 the case where splitting a load has caused multiple possibly trapping
 	 loads to appear.  */
-      EXECUTE_IF_SET_IN_SBITMAP (sub_blocks, 0, i, sbi)
+      EXECUTE_IF_SET_IN_BITMAP (sub_blocks, 0, i, sbi)
 	{
-	  rtx insn, end;
+	  rtx_insn *insn, *end;
 	  edge fallthru;
 
-	  bb = BASIC_BLOCK (i);
+	  bb = BASIC_BLOCK_FOR_FN (cfun, i);
 	  insn = BB_HEAD (bb);
 	  end = BB_END (bb);
 
@@ -1310,91 +1628,104 @@
 	        insn = NEXT_INSN (insn);
 	    }
 	}
-
-      sbitmap_free (sub_blocks);
     }
 
   {
     unsigned int i;
     bitmap b;
 
-    FOR_EACH_VEC_ELT (bitmap, reg_copy_graph, i, b)
+    FOR_EACH_VEC_ELT (reg_copy_graph, i, b)
       if (b)
 	BITMAP_FREE (b);
   }
 
-  VEC_free (bitmap, heap, reg_copy_graph);
+  reg_copy_graph.release ();
 
   BITMAP_FREE (decomposable_context);
   BITMAP_FREE (non_decomposable_context);
   BITMAP_FREE (subreg_context);
 }
 
-/* Gate function for lower subreg pass.  */
-
-static bool
-gate_handle_lower_subreg (void)
-{
-  return flag_split_wide_types != 0;
-}
-
 /* Implement first lower subreg pass.  */
 
-static unsigned int
-rest_of_handle_lower_subreg (void)
+namespace {
+
+const pass_data pass_data_lower_subreg =
+{
+  RTL_PASS, /* type */
+  "subreg1", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_LOWER_SUBREG, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_lower_subreg : public rtl_opt_pass
 {
-  decompose_multiword_subregs ();
-  return 0;
+public:
+  pass_lower_subreg (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_lower_subreg, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *) { return flag_split_wide_types != 0; }
+  virtual unsigned int execute (function *)
+    {
+      decompose_multiword_subregs (false);
+      return 0;
+    }
+
+}; // class pass_lower_subreg
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_lower_subreg (gcc::context *ctxt)
+{
+  return new pass_lower_subreg (ctxt);
 }
 
 /* Implement second lower subreg pass.  */
 
-static unsigned int
-rest_of_handle_lower_subreg2 (void)
-{
-  decompose_multiword_subregs ();
-  return 0;
-}
+namespace {
 
-struct rtl_opt_pass pass_lower_subreg =
+const pass_data pass_data_lower_subreg2 =
 {
- {
-  RTL_PASS,
-  "subreg1",	                        /* name */
-  gate_handle_lower_subreg,             /* gate */
-  rest_of_handle_lower_subreg,          /* execute */
-  NULL,                                 /* sub */
-  NULL,                                 /* next */
-  0,                                    /* static_pass_number */
-  TV_LOWER_SUBREG,                      /* tv_id */
-  0,                                    /* properties_required */
-  0,                                    /* properties_provided */
-  0,                                    /* properties_destroyed */
-  0,                                    /* todo_flags_start */
-  TODO_dump_func |
-  TODO_ggc_collect |
-  TODO_verify_flow                      /* todo_flags_finish */
- }
+  RTL_PASS, /* type */
+  "subreg2", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_LOWER_SUBREG, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
 };
 
-struct rtl_opt_pass pass_lower_subreg2 =
+class pass_lower_subreg2 : public rtl_opt_pass
 {
- {
-  RTL_PASS,
-  "subreg2",	                        /* name */
-  gate_handle_lower_subreg,             /* gate */
-  rest_of_handle_lower_subreg2,          /* execute */
-  NULL,                                 /* sub */
-  NULL,                                 /* next */
-  0,                                    /* static_pass_number */
-  TV_LOWER_SUBREG,                      /* tv_id */
-  0,                                    /* properties_required */
-  0,                                    /* properties_provided */
-  0,                                    /* properties_destroyed */
-  0,                                    /* todo_flags_start */
-  TODO_df_finish | TODO_verify_rtl_sharing |
-  TODO_dump_func |
-  TODO_ggc_collect |
-  TODO_verify_flow                      /* todo_flags_finish */
- }
-};
+public:
+  pass_lower_subreg2 (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_lower_subreg2, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *) { return flag_split_wide_types != 0; }
+  virtual unsigned int execute (function *)
+    {
+      decompose_multiword_subregs (true);
+      return 0;
+    }
+
+}; // class pass_lower_subreg2
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_lower_subreg2 (gcc::context *ctxt)
+{
+  return new pass_lower_subreg2 (ctxt);
+}