diff gcc/tree-vect-data-refs.c @ 145:1830386684a0

gcc-9.2.0
author anatofuz
date Thu, 13 Feb 2020 11:34:05 +0900
parents 84e7813d76e9
--- a/gcc/tree-vect-data-refs.c	Thu Oct 25 07:37:49 2018 +0900
+++ b/gcc/tree-vect-data-refs.c	Thu Feb 13 11:34:05 2020 +0900
@@ -1,5 +1,5 @@
 /* Data References Analysis and Manipulation Utilities for Vectorization.
-   Copyright (C) 2003-2018 Free Software Foundation, Inc.
+   Copyright (C) 2003-2020 Free Software Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com>
    and Ira Rosen <irar@il.ibm.com>
 
@@ -49,7 +49,6 @@
 #include "tree-vectorizer.h"
 #include "expr.h"
 #include "builtins.h"
-#include "params.h"
 #include "tree-cfg.h"
 #include "tree-hash-traits.h"
 #include "vec-perm-indices.h"
@@ -145,6 +144,30 @@
       if (rhs < lhs)
         scalar_type = rhs_type;
     }
+  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
+    {
+      unsigned int i = 0;
+      if (gimple_call_internal_p (call))
+	{
+	  internal_fn ifn = gimple_call_internal_fn (call);
+	  if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
+	    /* gimple_expr_type already picked the type of the loaded
+	       or stored data.  */
+	    i = ~0U;
+	  else if (internal_fn_mask_index (ifn) == 0)
+	    i = 1;
+	}
+      if (i < gimple_call_num_args (call))
+	{
+	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
+	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
+	    {
+	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
+	      if (rhs < lhs)
+		scalar_type = rhs_type;
+	    }
+	}
+    }
 
   *lhs_size_unit = lhs;
   *rhs_size_unit = rhs;
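
The hunk above teaches vect_get_smallest_scalar_type about internal-function
calls: for internal loads and stores the statement's expression type already
describes the data (i = ~0U makes the argument scan a no-op), while for other
masked internal functions whose mask is argument 0 the scan starts at
argument 1.  A stand-alone miniature of that index selection (the enum and
function names here are illustrative, not GCC API):

#include <cassert>

// Which call argument carries the data whose scalar type matters?
// ~0U means "none: the call's own expression type already has it".
enum ifn_kind { IFN_LOAD_STORE, IFN_MASKED, IFN_OTHER };

static unsigned data_arg_index (ifn_kind kind)
{
  if (kind == IFN_LOAD_STORE)
    return ~0U;      // gimple_expr_type already picked the type
  if (kind == IFN_MASKED)
    return 1;        // argument 0 is the mask; skip it
  return 0;
}

int main ()
{
  assert (data_arg_index (IFN_LOAD_STORE) == ~0U);
  assert (data_arg_index (IFN_MASKED) == 1);
  return 0;
}
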
@@ -159,9 +182,9 @@
 static opt_result
 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-
-  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+  if ((unsigned) param_vect_max_version_for_alias_checks == 0)
     return opt_result::failure_at (vect_location,
 				   "will not create alias checks, as"
 				   " --param vect-max-version-for-alias-checks"
@@ -210,26 +233,60 @@
     return true;
 
   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
-     group are emitted at the position of the last scalar load and all
-     stores in a group are emitted at the position of the last scalar store.
+     SLP group are emitted at the position of the last scalar load and
+     all loads in an interleaving group are emitted at the position
+     of the first scalar load.
+     Stores in a group are emitted at the position of the last scalar store.
      Compute that position and check whether the resulting order matches
-     the current one.  */
-  stmt_vec_info last_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
+     the current one.
+     We have not yet decided between SLP and interleaving so we have
+     to conservatively assume both.  */
+  stmt_vec_info il_a;
+  stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
   if (last_a)
-    for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
-	 s = DR_GROUP_NEXT_ELEMENT (s))
-      last_a = get_later_stmt (last_a, s);
+    {
+      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
+	   s = DR_GROUP_NEXT_ELEMENT (s))
+	last_a = get_later_stmt (last_a, s);
+      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
+	{
+	  for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
+	       s = DR_GROUP_NEXT_ELEMENT (s))
+	    if (get_later_stmt (il_a, s) == il_a)
+	      il_a = s;
+	}
+      else
+	il_a = last_a;
+    }
   else
-    last_a = stmtinfo_a;
-  stmt_vec_info last_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
+    last_a = il_a = stmtinfo_a;
+  stmt_vec_info il_b;
+  stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
   if (last_b)
-    for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
-	 s = DR_GROUP_NEXT_ELEMENT (s))
-      last_b = get_later_stmt (last_b, s);
+    {
+      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
+	   s = DR_GROUP_NEXT_ELEMENT (s))
+	last_b = get_later_stmt (last_b, s);
+      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
+	{
+	  for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
+	       s = DR_GROUP_NEXT_ELEMENT (s))
+	    if (get_later_stmt (il_b, s) == il_b)
+	      il_b = s;
+	}
+      else
+	il_b = last_b;
+    }
   else
-    last_b = stmtinfo_b;
-  return ((get_later_stmt (last_a, last_b) == last_a)
-	  == (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a));
+    last_b = il_b = stmtinfo_b;
+  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
+  return (/* SLP */
+	  (get_later_stmt (last_a, last_b) == last_a) == a_after_b
+	  /* Interleaving */
+	  && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
+	  /* Mixed */
+	  && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
+	  && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
 }
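
vect_preserves_scalar_order_p now has to allow for two emission positions per
load group: loads in an SLP group land at the last scalar load, loads in an
interleaving group at the first, and since the SLP-versus-interleaving choice
has not been made yet, every last/first pairing must agree with the scalar
order.  A toy model over plain statement positions (illustrative names, not
GCC API):

#include <algorithm>
#include <cassert>
#include <vector>

struct access_group { std::vector<int> pos; bool is_load; };

static bool
order_preserved (const access_group &a, int stmt_a,
                 const access_group &b, int stmt_b)
{
  int last_a = *std::max_element (a.pos.begin (), a.pos.end ());
  int last_b = *std::max_element (b.pos.begin (), b.pos.end ());
  // Loads may also be emitted at the first group member (interleaving);
  // stores always go to the last member.
  int il_a = a.is_load ? *std::min_element (a.pos.begin (), a.pos.end ())
                       : last_a;
  int il_b = b.is_load ? *std::min_element (b.pos.begin (), b.pos.end ())
                       : last_b;
  bool a_after_b = stmt_a > stmt_b;
  return (last_a > last_b) == a_after_b      // SLP / SLP
         && (il_a > il_b) == a_after_b       // interleaving / interleaving
         && (il_a > last_b) == a_after_b     // mixed
         && (last_a > il_b) == a_after_b;    // mixed
}

int main ()
{
  access_group loads1 { {2, 3}, true }, stores1 { {4, 5}, false };
  assert (order_preserved (loads1, 2, stores1, 4));
  // SLP would emit the {2, 5} load group at position 5, after the store
  // at 4, reversing the scalar order of statements 2 and 3.
  access_group loads2 { {2, 5}, true }, stores2 { {3, 4}, false };
  assert (!order_preserved (loads2, 2, stores2, 3));
  return 0;
}
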
 
 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
@@ -248,7 +305,7 @@
 				       loop_vec_info loop_vinfo,
 				       int loop_depth, unsigned int *max_vf)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   lambda_vector dist_v;
   unsigned int i;
   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
@@ -305,7 +362,7 @@
 				  unsigned int *max_vf)
 {
   unsigned int i;
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   struct data_reference *dra = DDR_A (ddr);
   struct data_reference *drb = DDR_B (ddr);
   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
@@ -473,8 +530,22 @@
 	     reversed (to make distance vector positive), and the actual
 	     distance is negative.  */
 	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+	    dump_printf_loc (MSG_NOTE, vect_location,
 	                     "dependence distance negative.\n");
+	  /* When doing outer loop vectorization, we need to check if there is
+	     a backward dependence at the inner loop level if the dependence
+	     at the outer loop is reversed.  See PR81740.  */
+	  if (nested_in_vect_loop_p (loop, stmtinfo_a)
+	      || nested_in_vect_loop_p (loop, stmtinfo_b))
+	    {
+	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
+							 DDR_LOOP_NEST (ddr));
+	      if (dist_v[inner_depth] < 0)
+		return opt_result::failure_at (stmtinfo_a->stmt,
+					       "not vectorized, dependence "
+					       "between data-refs %T and %T\n",
+					       DR_REF (dra), DR_REF (drb));
+	    }
 	  /* Record a negative dependence distance to later limit the
 	     amount of stmt copying / unrolling we can perform.
 	     Only need to handle read-after-write dependence.  */
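
The added guard implements the PR81740 fix: when outer-loop vectorization is
considered and the outer distance vector was reversed to be made positive, a
backward dependence can still lurk in the inner-loop component, which must be
rejected.  An illustrative nest of that shape (a reduction of the problem,
not the PR's actual testcase), with distance +1 in the outer dimension and
-1 in the inner one:

// The write a[j-1][i+1] at iteration (i, j) is read as a[j][i] at
// iteration (i+1, j-1): forward in i (outer), backward in j (inner).
int a[64][64];

void f (void)
{
  for (int i = 0; i < 63; i++)     // outer loop, vectorization candidate
    for (int j = 1; j < 64; j++)
      a[j - 1][i + 1] = a[j][i];
}
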
@@ -490,7 +561,7 @@
 	{
 	  /* The dependence distance requires reduction of the maximal
 	     vectorization factor.  */
-	  *max_vf = abs (dist);
+	  *max_vf = abs_dist;
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_NOTE, vect_location,
 	                     "adjusting maximal vectorization factor to %i\n",
@@ -795,7 +866,7 @@
 vect_record_base_alignments (vec_info *vinfo)
 {
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
-  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
+  class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
   data_reference *dr;
   unsigned int i;
   FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
@@ -819,7 +890,7 @@
 
 /* Return the target alignment for the vectorized form of DR_INFO.  */
 
-static unsigned int
+static poly_uint64
 vect_calculate_target_alignment (dr_vec_info *dr_info)
 {
   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
@@ -842,7 +913,7 @@
   stmt_vec_info stmt_info = dr_info->stmt;
   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = NULL;
+  class loop *loop = NULL;
   tree ref = DR_REF (dr_info->dr);
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 
@@ -862,10 +933,26 @@
   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
   bool step_preserves_misalignment_p;
 
-  unsigned HOST_WIDE_INT vector_alignment
-    = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
+  poly_uint64 vector_alignment
+    = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
   DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
 
+  /* If the main loop has peeled for alignment we have no way of knowing
+     whether the data accesses in the epilogues are aligned.  We can't at
+     compile time answer the question whether we have entered the main loop or
+     not.  Fixes PR 92351.  */
+  if (loop_vinfo)
+    {
+      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+      if (orig_loop_vinfo
+	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
+	return;
+    }
+
+  unsigned HOST_WIDE_INT vect_align_c;
+  if (!vector_alignment.is_constant (&vect_align_c))
+    return;
+
   /* No step for BB vectorization.  */
   if (!loop)
     {
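
Two separate changes land in the hunk above.  First, the target alignment is
now a poly_uint64 so that scalable-vector targets (SVE-style, where sizes
have the form c0 + c1 * N bytes for a runtime N) can express it; the
is_constant test bails out when the integer misalignment arithmetic below
would not be meaningful.  Second, the PR 92351 early return: once the main
loop has peeled for alignment, whether an epilogue loop's accesses are
aligned cannot be answered at compile time, so no alignment is recorded for
it.  A mock of the poly-int pattern (stand-in type, not GCC's poly_uint64):

#include <cassert>
#include <cstdint>

struct poly_u64              // models coeff0 + coeff1 * N
{
  uint64_t coeff0, coeff1;
  bool is_constant (uint64_t *out) const
  {
    if (coeff1 != 0)
      return false;          // runtime-variable part present
    *out = coeff0;
    return true;
  }
};

int main ()
{
  uint64_t c;
  poly_u64 fixed { 16, 0 }, scalable { 16, 16 };
  assert (fixed.is_constant (&c) && c == 16);
  assert (!scalable.is_constant (&c));   // e.g. 16 + 16 * N bytes
  return 0;
}
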
@@ -882,7 +969,7 @@
   else if (nested_in_vect_loop_p (loop, stmt_info))
     {
       step_preserves_misalignment_p
-	= (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
+	= (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
 
       if (dump_enabled_p ())
 	{
@@ -904,7 +991,7 @@
     {
       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
       step_preserves_misalignment_p
-	= multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
+	= multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
 
       if (!step_preserves_misalignment_p && dump_enabled_p ())
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -923,7 +1010,7 @@
       base_misalignment = (*entry)->base_misalignment;
     }
 
-  if (drb->offset_alignment < vector_alignment
+  if (drb->offset_alignment < vect_align_c
       || !step_preserves_misalignment_p
       /* We need to know whether the step wrt the vectorized loop is
 	 negative when computing the starting misalignment below.  */
@@ -935,13 +1022,13 @@
       return;
     }
 
-  if (base_alignment < vector_alignment)
+  if (base_alignment < vect_align_c)
     {
       unsigned int max_alignment;
       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
-      if (max_alignment < vector_alignment
+      if (max_alignment < vect_align_c
 	  || !vect_can_force_dr_alignment_p (base,
-					     vector_alignment * BITS_PER_UNIT))
+					     vect_align_c * BITS_PER_UNIT))
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_NOTE, vect_location,
@@ -972,8 +1059,7 @@
 		     * TREE_INT_CST_LOW (drb->step));
 
   unsigned int const_misalignment;
-  if (!known_misalignment (misalignment, vector_alignment,
-			   &const_misalignment))
+  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1010,20 +1096,10 @@
   unsigned int i;
   vec<dr_p> same_aligned_drs;
   struct data_reference *current_dr;
-  int dr_size = vect_get_scalar_dr_size (dr_info);
-  int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
-  stmt_vec_info stmt_info = dr_info->stmt;
   stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
 
- /* For interleaved data accesses the step in the loop must be multiplied by
-     the size of the interleaving group.  */
-  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
-  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
-    dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
-
-  /* It can be assumed that the data refs with the same alignment as dr_peel
-     are aligned in the vector loop.  */
+  /* It can be assumed that if dr_info has the same alignment as dr_peel,
+     it is aligned in the vector loop.  */
   same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
     {
@@ -1031,20 +1107,20 @@
         continue;
       gcc_assert (!known_alignment_for_access_p (dr_info)
 		  || !known_alignment_for_access_p (dr_peel_info)
-		  || (DR_MISALIGNMENT (dr_info) / dr_size
-		      == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
+		  || (DR_MISALIGNMENT (dr_info)
+		      == DR_MISALIGNMENT (dr_peel_info)));
       SET_DR_MISALIGNMENT (dr_info, 0);
       return;
     }
 
-  if (known_alignment_for_access_p (dr_info)
+  unsigned HOST_WIDE_INT alignment;
+  if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
+      && known_alignment_for_access_p (dr_info)
       && known_alignment_for_access_p (dr_peel_info))
     {
-      bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
-					    size_zero_node) < 0;
       int misal = DR_MISALIGNMENT (dr_info);
-      misal += negative ? -npeel * dr_size : npeel * dr_size;
-      misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
+      misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
+      misal &= alignment - 1;
       SET_DR_MISALIGNMENT (dr_info, misal);
       return;
     }
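
The rewritten update drops the old element-size scaling: DR_MISALIGNMENT is
maintained in bytes against DR_TARGET_ALIGNMENT, so peeling NPEEL scalar
iterations simply moves the misalignment by NPEEL * DR_STEP bytes modulo the
power-of-two target alignment, and the explicit negative-step special case
disappears because DR_STEP carries its own sign.  A plain-integer model of
the adjustment (illustrative, assuming a compile-time-constant alignment as
the is_constant guard above ensures):

#include <cassert>

static unsigned
misalign_after_peel (int misal, int npeel, long step, unsigned target_align)
{
  long moved = misal + (long) npeel * step;     // bytes moved by peeling
  return (unsigned) moved & (target_align - 1); // mod power-of-two alignment
}

int main ()
{
  // 4-byte elements, 16-byte target alignment, initially 8 bytes off:
  // peeling two iterations gives (8 + 2*4) & 15 == 0, i.e. aligned.
  assert (misalign_after_peel (8, 2, 4, 16) == 0);
  // A negative step works through the same modular arithmetic.
  assert (misalign_after_peel (8, 2, -4, 16) == 0);
  return 0;
}
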
@@ -1594,7 +1670,7 @@
 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 {
   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   enum dr_alignment_support supportable_dr_alignment;
   dr_vec_info *first_store = NULL;
   dr_vec_info *dr0_info = NULL;
@@ -1688,7 +1764,12 @@
 						    size_zero_node) < 0;
 
 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
-	      unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
+	      /* If known_alignment_for_access_p then we have set
+	         DR_MISALIGNMENT which is only done if we know it at compiler
+	         time, so it is safe to assume target alignment is constant.
+	       */
+	      unsigned int target_align =
+		DR_TARGET_ALIGNMENT (dr_info).to_constant ();
 	      unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
 	      mis = (negative
 		     ? DR_MISALIGNMENT (dr_info)
@@ -1752,7 +1833,7 @@
 		 computation will be invariant in the outermost loop.  */
 	      else if (same_align_drs_max == same_align_drs)
 		{
-		  struct loop *ivloop0, *ivloop;
+		  class loop *ivloop0, *ivloop;
 		  ivloop0 = outermost_invariant_loop_for_expr
 		    (loop, DR_BASE_ADDRESS (dr0_info->dr));
 		  ivloop = outermost_invariant_loop_for_expr
@@ -1965,7 +2046,12 @@
 	      mis = (negative
 		     ? DR_MISALIGNMENT (dr0_info)
 		     : -DR_MISALIGNMENT (dr0_info));
-	      unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
+	      /* If known_alignment_for_access_p then we have set
+	         DR_MISALIGNMENT which is only done if we know it at compiler
+	         time, so it is safe to assume target alignment is constant.
+	       */
+	      unsigned int target_align =
+		DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
 	      npeel = ((mis & (target_align - 1))
 		       / vect_get_scalar_dr_size (dr0_info));
             }
@@ -1999,15 +2085,27 @@
       if (do_peeling)
         {
           unsigned max_allowed_peel
-            = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
+	    = param_vect_max_peeling_for_alignment;
+	  if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
+	    max_allowed_peel = 0;
           if (max_allowed_peel != (unsigned)-1)
             {
               unsigned max_peel = npeel;
               if (max_peel == 0)
                 {
-		  unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
-		  max_peel = (target_align
-			      / vect_get_scalar_dr_size (dr0_info) - 1);
+		  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
+		  unsigned HOST_WIDE_INT target_align_c;
+		  if (target_align.is_constant (&target_align_c))
+		    max_peel =
+		      target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
+		  else
+		    {
+		      do_peeling = false;
+		      if (dump_enabled_p ())
+			dump_printf_loc (MSG_NOTE, vect_location,
+			  "Disable peeling, max peels set and vector"
+			  " alignment unknown\n");
+		    }
                 }
               if (max_peel > max_allowed_peel)
                 {
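
Two behavioural points above: the cheap vector cost model now caps the
allowed peeling at zero, and when the exact peel count is still unknown the
worst case is derived from the target alignment, provided that alignment is
a compile-time constant.  A worked instance of the bound (illustrative
values):

#include <cassert>

int main ()
{
  // With a 16-byte target alignment and 4-byte scalars, at most
  // 16/4 - 1 = 3 scalar iterations are ever needed to reach an aligned
  // address; that worst case is what gets compared against
  // --param vect-max-peeling-for-alignment.
  unsigned target_align_c = 16, dr_size = 4;
  unsigned max_peel = target_align_c / dr_size - 1;
  assert (max_peel == 3);
  return 0;
}
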
@@ -2083,15 +2181,16 @@
   /* (2) Versioning to force alignment.  */
 
   /* Try versioning if:
-     1) optimize loop for speed
+     1) optimize loop for speed and the cost-model is not cheap
      2) there is at least one unsupported misaligned data ref with an unknown
         misalignment, and
      3) all misaligned data refs with a known misalignment are supported, and
      4) the number of runtime alignment checks is within reason.  */
 
-  do_versioning =
-	optimize_loop_nest_for_speed_p (loop)
-	&& (!loop->inner); /* FORNOW */
+  do_versioning
+    = (optimize_loop_nest_for_speed_p (loop)
+       && !loop->inner /* FORNOW */
+       && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
 
   if (do_versioning)
     {
@@ -2127,7 +2226,7 @@
 
               if (known_alignment_for_access_p (dr_info)
                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
-                     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
+		  >= (unsigned) param_vect_max_version_for_alignment_checks)
                 {
                   do_versioning = false;
                   break;
@@ -2147,19 +2246,35 @@
 		  break;
 		}
 
+	      /* Forcing alignment in the first iteration is no good if
+		 we don't keep it across iterations.  For now, just disable
+		 versioning in this case.
+		 ?? We could actually unroll the loop to achieve the required
+		 overall step alignment, and forcing the alignment could be
+		 done by doing some iterations of the non-vectorized loop.  */
+	      if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+			       * DR_STEP_ALIGNMENT (dr),
+			       DR_TARGET_ALIGNMENT (dr_info)))
+		{
+		  do_versioning = false;
+		  break;
+		}
+
               /* The rightmost bits of an aligned address must be zeros.
                  Construct the mask needed for this test.  For example,
                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
                  mask must be 15 = 0xf. */
 	      mask = size - 1;
 
-              /* FORNOW: use the same mask to test all potentially unaligned
-                 references in the loop.  The vectorizer currently supports
-                 a single vector size, see the reference to
-                 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
-                 vectorization factor is computed.  */
-              gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
-                          || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
+	      /* FORNOW: use the same mask to test all potentially unaligned
+		 references in the loop.  */
+	      if (LOOP_VINFO_PTR_MASK (loop_vinfo)
+		  && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
+		{
+		  do_versioning = false;
+		  break;
+		}
+
               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
 	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
             }
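
The new multiple_p guard rejects versioning when an alignment forced for the
first iteration would not survive later ones: each vector iteration advances
a data reference by VF * step bytes, so that advance must itself be a
multiple of the target alignment.  Numeric sketch (illustrative values):

#include <cassert>

int main ()
{
  unsigned target_align = 16;
  // VF = 4 over a step with 4-byte alignment: 4 * 4 = 16, alignment is
  // preserved across vector iterations, so versioning remains possible.
  assert ((4u * 4u) % target_align == 0);
  // VF = 2 over the same step: 2 * 4 = 8, iteration 1 would be
  // misaligned again, so do_versioning is cleared.
  assert ((2u * 4u) % target_align != 0);
  return 0;
}
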
@@ -2246,11 +2361,18 @@
   if (maybe_ne (diff, 0))
     {
       /* Get the wider of the two alignments.  */
-      unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
-			      / BITS_PER_UNIT);
-      unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
-			      / BITS_PER_UNIT);
-      unsigned int max_align = MAX (align_a, align_b);
+      poly_uint64 align_a =
+	exact_div (vect_calculate_target_alignment (dr_info_a),
+		   BITS_PER_UNIT);
+      poly_uint64 align_b =
+	exact_div (vect_calculate_target_alignment (dr_info_b),
+		   BITS_PER_UNIT);
+      unsigned HOST_WIDE_INT align_a_c, align_b_c;
+      if (!align_a.is_constant (&align_a_c)
+	  || !align_b.is_constant (&align_b_c))
+	return;
+
+      unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
 
       /* Require the gap to be a multiple of the larger vector alignment.  */
       if (!multiple_p (diff, max_align))
@@ -2438,7 +2560,8 @@
 	  return true;
 	}
 
-      dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
       STMT_VINFO_STRIDED_P (stmt_info) = true;
       return true;
     }
@@ -2450,40 +2573,15 @@
       struct data_reference *data_ref = dr;
       unsigned int count = 1;
       tree prev_init = DR_INIT (data_ref);
-      stmt_vec_info prev = stmt_info;
       HOST_WIDE_INT diff, gaps = 0;
 
       /* By construction, all group members have INTEGER_CST DR_INITs.  */
       while (next)
         {
-          /* Skip same data-refs.  In case that two or more stmts share
-             data-ref (supported only for loads), we vectorize only the first
-             stmt, and the rest get their vectorized loads from the first
-             one.  */
-          if (!tree_int_cst_compare (DR_INIT (data_ref),
-				     DR_INIT (STMT_VINFO_DATA_REF (next))))
-            {
-              if (DR_IS_WRITE (data_ref))
-                {
-                  if (dump_enabled_p ())
-                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                     "Two store stmts share the same dr.\n");
-                  return false;
-                }
-
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_NOTE, vect_location,
-				 "Two or more load stmts share the same dr.\n");
-
-	      /* For load use the same data-ref load.  */
-	      DR_GROUP_SAME_DR_STMT (next) = prev;
-
-	      prev = next;
-	      next = DR_GROUP_NEXT_ELEMENT (next);
-	      continue;
-            }
-
-	  prev = next;
+          /* We never have the same DR multiple times.  */
+          gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
+				DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
+
 	  data_ref = STMT_VINFO_DATA_REF (next);
 
 	  /* All group members have the same STEP by construction.  */
@@ -2559,11 +2657,22 @@
 	    dump_printf (MSG_NOTE, "strided store ");
 	  else
 	    dump_printf (MSG_NOTE, "store ");
-	  dump_printf (MSG_NOTE, "of size %u starting with %G",
-		       (unsigned)groupsize, stmt_info->stmt);
+	  dump_printf (MSG_NOTE, "of size %u\n",
+		       (unsigned)groupsize);
+	  dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
+	  next = DR_GROUP_NEXT_ELEMENT (stmt_info);
+	  while (next)
+	    {
+	      if (DR_GROUP_GAP (next) != 1)
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "\t<gap of %d elements>\n",
+				 DR_GROUP_GAP (next) - 1);
+	      dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
+	      next = DR_GROUP_NEXT_ELEMENT (next);
+	    }
 	  if (DR_GROUP_GAP (stmt_info) != 0)
 	    dump_printf_loc (MSG_NOTE, vect_location,
-			     "There is a gap of %u elements after the group\n",
+			     "\t<gap of %d elements>\n",
 			     DR_GROUP_GAP (stmt_info));
 	}
 
@@ -2617,7 +2726,7 @@
   tree scalar_type = TREE_TYPE (DR_REF (dr));
   stmt_vec_info stmt_info = dr_info->stmt;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = NULL;
+  class loop *loop = NULL;
 
   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
     return true;
@@ -2770,10 +2879,12 @@
 }
 
 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
-   and STMT2_INFO being in a single group.  */
+   and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
+   be grouped in SLP mode.  */
 
 static bool
-can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
+can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
+		   bool allow_slp_p)
 {
   if (gimple_assign_single_p (stmt1_info->stmt))
     return gimple_assign_single_p (stmt2_info->stmt);
@@ -2795,7 +2906,8 @@
 	 like those created by build_mask_conversion.  */
       tree mask1 = gimple_call_arg (call1, 2);
       tree mask2 = gimple_call_arg (call2, 2);
-      if (!operand_equal_p (mask1, mask2, 0))
+      if (!operand_equal_p (mask1, mask2, 0)
+          && (ifn == IFN_MASK_STORE || !allow_slp_p))
 	{
 	  mask1 = strip_conversion (mask1);
 	  if (!mask1)
@@ -2881,7 +2993,7 @@
 	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
 					DR_BASE_ADDRESS (drb)) != 0
 	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
-	      || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
+	      || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
 	    break;
 
 	  /* Check that the data-refs have the same constant size.  */
@@ -2907,6 +3019,13 @@
 	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
 	    break;
 
+	  /* Different .GOMP_SIMD_LANE calls still give the same lane,
+	     just hold extra information.  */
+	  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
+	      && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
+	      && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
+	    break;
+
 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
@@ -2966,6 +3085,13 @@
 	  DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
 	  lastinfo = stmtinfo_b;
 
+	  STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
+	    = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
+
+	  if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Load suitable for SLP vectorization only.\n");
+
 	  if (init_b == init_prev
 	      && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
 	      && dump_enabled_p ())
@@ -2988,8 +3114,8 @@
       stmt_vec_info next, g = grp;
       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
 	{
-	  if ((DR_INIT (STMT_VINFO_DR_INFO (next)->dr)
-	       == DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
+	  if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
+				  DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
 	      && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
 	    first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
 	  g = next;
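
The switch from == to tree_int_cst_equal on the DR_INITs fixes a
value-versus-identity bug: two INTEGER_CST nodes can hold the same constant
without being the same node, in which case pointer comparison silently
misses the duplicate.  The same trap in miniature:

#include <cassert>

int main ()
{
  // Two distinct nodes holding equal values, like two INTEGER_CSTs
  // that were never commoned into one tree.
  int x = 42, y = 42;
  int *a = &x, *b = &y;
  assert (a != b);     // identity comparison: the duplicate is missed
  assert (*a == *b);   // value comparison: what the loop intended
  return 0;
}
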
@@ -3001,7 +3127,7 @@
          Note this is a heuristic but one with the property that *it
 	 is fixed up completely.  */
       g = grp;
-      stmt_vec_info newgroup = NULL, ng;
+      stmt_vec_info newgroup = NULL, ng = grp;
       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
 	{
 	  if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
@@ -3353,7 +3479,6 @@
   /* First, we collect all data ref pairs for aliasing checks.  */
   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
     {
-      int comp_res;
       poly_uint64 lower_bound;
       tree segment_length_a, segment_length_b;
       unsigned HOST_WIDE_INT access_size_a, access_size_b;
@@ -3385,10 +3510,13 @@
       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
       stmt_vec_info stmt_info_b = dr_info_b->stmt;
 
+      bool preserves_scalar_order_p
+	= vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
+
       /* Skip the pair if inter-iteration dependencies are irrelevant
 	 and intra-iteration dependencies are guaranteed to be honored.  */
       if (ignore_step_p
-	  && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
+	  && (preserves_scalar_order_p
 	      || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
 						 &lower_bound)))
 	{
@@ -3469,14 +3597,11 @@
       align_a = vect_vfa_align (dr_info_a);
       align_b = vect_vfa_align (dr_info_b);
 
-      comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
-					DR_BASE_ADDRESS (dr_info_b->dr));
-      if (comp_res == 0)
-	comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
-					  DR_OFFSET (dr_info_b->dr));
-
       /* See whether the alias is known at compilation time.  */
-      if (comp_res == 0
+      if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
+			   DR_BASE_ADDRESS (dr_info_b->dr), 0)
+	  && operand_equal_p (DR_OFFSET (dr_info_a->dr),
+			      DR_OFFSET (dr_info_b->dr), 0)
 	  && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
 	  && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
 	  && poly_int_tree_p (segment_length_a)
@@ -3509,15 +3634,21 @@
 					   stmt_info_b->stmt);
 	}
 
+      dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
+			    access_size_a, align_a);
+      dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
+			    access_size_b, align_b);
+      /* Canonicalize the order to be the one that's needed for accurate
+	 RAW, WAR and WAW flags, in cases where the data references are
+	 well-ordered.  The order doesn't really matter otherwise,
+	 but we might as well be consistent.  */
+      if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
+	std::swap (dr_a, dr_b);
+
       dr_with_seg_len_pair_t dr_with_seg_len_pair
-	(dr_with_seg_len (dr_info_a->dr, segment_length_a,
-			  access_size_a, align_a),
-	 dr_with_seg_len (dr_info_b->dr, segment_length_b,
-			  access_size_b, align_b));
-
-      /* Canonicalize pairs by sorting the two DR members.  */
-      if (comp_res > 0)
-	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
+	(dr_a, dr_b, (preserves_scalar_order_p
+		      ? dr_with_seg_len_pair_t::WELL_ORDERED
+		      : dr_with_seg_len_pair_t::REORDERED));
 
       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
     }
@@ -3527,16 +3658,18 @@
   unsigned int count = (comp_alias_ddrs.length ()
 			+ check_unequal_addrs.length ());
 
-  dump_printf_loc (MSG_NOTE, vect_location,
-		   "improved number of alias checks from %d to %d\n",
-		   may_alias_ddrs.length (), count);
-  if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "improved number of alias checks from %d to %d\n",
+		     may_alias_ddrs.length (), count);
+  unsigned limit = param_vect_max_version_for_alias_checks;
+  if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
+    limit = param_vect_max_version_for_alias_checks * 6 / 10;
+  if (count > limit)
     return opt_result::failure_at
       (vect_location,
-       "number of versioning for alias "
-       "run-time tests exceeds %d "
-       "(--param vect-max-version-for-alias-checks)\n",
-       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
+       "number of versioning for alias run-time tests exceeds %d "
+       "(--param vect-max-version-for-alias-checks)\n", limit);
 
   return opt_result::success ();
 }
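
Besides guarding the statistics dump behind dump_enabled_p, the hunk shrinks
the runtime alias-check budget under the cheap cost model (tested via
flag_simd_cost_model here) to 6/10 of --param
vect-max-version-for-alias-checks.  The division is integer, so with the
parameter's default of 10 the cheap-model limit becomes 6:

#include <cassert>

int main ()
{
  unsigned param_value = 10;               // default of the --param above
  unsigned limit = param_value * 6 / 10;   // cheap-cost-model budget
  assert (limit == 6);
  return 0;
}
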
@@ -3544,28 +3677,22 @@
 /* Check whether we can use an internal function for a gather load
    or scatter store.  READ_P is true for loads and false for stores.
    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
-   the type of the memory elements being loaded or stored.  OFFSET_BITS
-   is the number of bits in each scalar offset and OFFSET_SIGN is the
-   sign of the offset.  SCALE is the amount by which the offset should
+   the type of the memory elements being loaded or stored.  OFFSET_TYPE
+   is the type of the offset that is being applied to the invariant
+   base address.  SCALE is the amount by which the offset should
    be multiplied *after* it has been converted to address width.
 
-   Return true if the function is supported, storing the function
-   id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
+   Return true if the function is supported, storing the function id in
+   *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
 
 bool
-vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
-			  tree memory_type, unsigned int offset_bits,
-			  signop offset_sign, int scale,
-			  internal_fn *ifn_out, tree *element_type_out)
+vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
+			  tree vectype, tree memory_type, tree offset_type,
+			  int scale, internal_fn *ifn_out,
+			  tree *offset_vectype_out)
 {
   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
-  if (offset_bits > element_bits)
-    /* Internal functions require the offset to be the same width as
-       the vector elements.  We can extend narrower offsets, but it isn't
-       safe to truncate wider offsets.  */
-    return false;
-
   if (element_bits != memory_bits)
     /* For now the vector elements must be the same width as the
        memory elements.  */
@@ -3578,14 +3705,28 @@
   else
     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
 
-  /* Test whether the target supports this combination.  */
-  if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
-					       offset_sign, scale))
-    return false;
-
-  *ifn_out = ifn;
-  *element_type_out = TREE_TYPE (vectype);
-  return true;
+  for (;;)
+    {
+      tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
+      if (!offset_vectype)
+	return false;
+
+      /* Test whether the target supports this combination.  */
+      if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
+						  offset_vectype, scale))
+	{
+	  *ifn_out = ifn;
+	  *offset_vectype_out = offset_vectype;
+	  return true;
+	}
+
+      if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
+	  && TYPE_PRECISION (offset_type) >= element_bits)
+	return false;
+
+      offset_type = build_nonstandard_integer_type
+	(TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
+    }
 }
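
vect_gather_scatter_fn_p no longer insists up front that the offset be no
wider than a vector element; it now receives the offset's scalar type and
probes the target, doubling the offset precision until the combination is
supported and giving up once the offset is both pointer-sized and at least
element-sized (widening further cannot help, and truncating a wider offset
is unsafe).  The control flow, reduced to plain integers with a mock target
hook (illustrative names, not GCC API):

#include <cassert>

// "supported" stands in for internal_gather_scatter_fn_supported_p.
static bool
probe_offset_type (unsigned offset_bits, unsigned element_bits,
                   unsigned pointer_bits, bool (*supported) (unsigned))
{
  for (;;)
    {
      if (supported (offset_bits))
        return true;
      if (offset_bits >= pointer_bits && offset_bits >= element_bits)
        return false;
      offset_bits *= 2;        // retry with a doubled-precision offset
    }
}

static bool only_64_bit (unsigned bits) { return bits == 64; }

int main ()
{
  // A 16-bit source offset is widened 16 -> 32 -> 64 and accepted by a
  // target that only handles 64-bit offsets.
  assert (probe_offset_type (16, 32, 64, only_64_bit));
  return 0;
}
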
 
 /* STMT_INFO is a call to an internal gather load or scatter store function.
@@ -3619,7 +3760,7 @@
 {
   HOST_WIDE_INT scale = 1;
   poly_int64 pbitpos, pbitsize;
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   tree offtype = NULL_TREE;
   tree decl = NULL_TREE, base, off;
@@ -3628,7 +3769,7 @@
   machine_mode pmode;
   int punsignedp, reversep, pvolatilep = 0;
   internal_fn ifn;
-  tree element_type;
+  tree offset_vectype;
   bool masked_p = false;
 
   /* See whether this is already a call to a gather/scatter internal function.
@@ -3789,13 +3930,18 @@
 	    {
 	      int new_scale = tree_to_shwi (op1);
 	      /* Only treat this as a scaling operation if the target
-		 supports it.  */
+		 supports it for at least some offset type.  */
 	      if (use_ifn_p
-		  && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
-						vectype, memory_type, 1,
-						TYPE_SIGN (TREE_TYPE (op0)),
+		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
+						masked_p, vectype, memory_type,
+						signed_char_type_node,
 						new_scale, &ifn,
-						&element_type))
+						&offset_vectype)
+		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
+						masked_p, vectype, memory_type,
+						unsigned_char_type_node,
+						new_scale, &ifn,
+						&offset_vectype))
 		break;
 	      scale = new_scale;
 	      off = op0;
@@ -3809,6 +3955,16 @@
 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
 	    break;
+
+	  /* Don't include the conversion if the target is happy with
+	     the current offset type.  */
+	  if (use_ifn_p
+	      && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
+					   masked_p, vectype, memory_type,
+					   TREE_TYPE (off), scale, &ifn,
+					   &offset_vectype))
+	    break;
+
 	  if (TYPE_PRECISION (TREE_TYPE (op0))
 	      == TYPE_PRECISION (TREE_TYPE (off)))
 	    {
@@ -3816,14 +3972,6 @@
 	      continue;
 	    }
 
-	  /* The internal functions need the offset to be the same width
-	     as the elements of VECTYPE.  Don't include operations that
-	     cast the offset from that width to a different width.  */
-	  if (use_ifn_p
-	      && (int_size_in_bytes (TREE_TYPE (vectype))
-		  == int_size_in_bytes (TREE_TYPE (off))))
-	    break;
-
 	  if (TYPE_PRECISION (TREE_TYPE (op0))
 	      < TYPE_PRECISION (TREE_TYPE (off)))
 	    {
@@ -3850,10 +3998,9 @@
 
   if (use_ifn_p)
     {
-      if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
-				     memory_type, TYPE_PRECISION (offtype),
-				     TYPE_SIGN (offtype), scale, &ifn,
-				     &element_type))
+      if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
+				     vectype, memory_type, offtype, scale,
+				     &ifn, &offset_vectype))
 	return false;
     }
   else
@@ -3873,7 +4020,8 @@
 	return false;
 
       ifn = IFN_LAST;
-      element_type = TREE_TYPE (vectype);
+      /* The offset vector type will be read from DECL when needed.  */
+      offset_vectype = NULL_TREE;
     }
 
   info->ifn = ifn;
@@ -3881,9 +4029,9 @@
   info->base = base;
   info->offset = off;
   info->offset_dt = vect_unknown_def_type;
-  info->offset_vectype = NULL_TREE;
+  info->offset_vectype = offset_vectype;
   info->scale = scale;
-  info->element_type = element_type;
+  info->element_type = TREE_TYPE (vectype);
   info->memory_type = memory_type;
   return true;
 }
@@ -3961,47 +4109,67 @@
 	  && DR_OFFSET (newdr)
 	  && DR_INIT (newdr)
 	  && DR_STEP (newdr)
+	  && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
 	  && integer_zerop (DR_STEP (newdr)))
 	{
+	  tree base_address = DR_BASE_ADDRESS (newdr);
 	  tree off = DR_OFFSET (newdr);
+	  tree step = ssize_int (1);
+	  if (integer_zerop (off)
+	      && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
+	    {
+	      off = TREE_OPERAND (base_address, 1);
+	      base_address = TREE_OPERAND (base_address, 0);
+	    }
 	  STRIP_NOPS (off);
-	  if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
-	      && TREE_CODE (off) == MULT_EXPR
+	  if (TREE_CODE (off) == MULT_EXPR
 	      && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
 	    {
-	      tree step = TREE_OPERAND (off, 1);
+	      step = TREE_OPERAND (off, 1);
 	      off = TREE_OPERAND (off, 0);
 	      STRIP_NOPS (off);
-	      if (CONVERT_EXPR_P (off)
-		  && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
-		      < TYPE_PRECISION (TREE_TYPE (off))))
-		off = TREE_OPERAND (off, 0);
-	      if (TREE_CODE (off) == SSA_NAME)
+	    }
+	  if (CONVERT_EXPR_P (off)
+	      && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
+		  < TYPE_PRECISION (TREE_TYPE (off))))
+	    off = TREE_OPERAND (off, 0);
+	  if (TREE_CODE (off) == SSA_NAME)
+	    {
+	      gimple *def = SSA_NAME_DEF_STMT (off);
+	      /* Look through widening conversion.  */
+	      if (is_gimple_assign (def)
+		  && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
 		{
-		  gimple *def = SSA_NAME_DEF_STMT (off);
+		  tree rhs1 = gimple_assign_rhs1 (def);
+		  if (TREE_CODE (rhs1) == SSA_NAME
+		      && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
+		      && (TYPE_PRECISION (TREE_TYPE (off))
+			  > TYPE_PRECISION (TREE_TYPE (rhs1))))
+		    def = SSA_NAME_DEF_STMT (rhs1);
+		}
+	      if (is_gimple_call (def)
+		  && gimple_call_internal_p (def)
+		  && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
+		{
+		  tree arg = gimple_call_arg (def, 0);
 		  tree reft = TREE_TYPE (DR_REF (newdr));
-		  if (is_gimple_call (def)
-		      && gimple_call_internal_p (def)
-		      && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
+		  gcc_assert (TREE_CODE (arg) == SSA_NAME);
+		  arg = SSA_NAME_VAR (arg);
+		  if (arg == loop->simduid
+		      /* For now.  */
+		      && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
 		    {
-		      tree arg = gimple_call_arg (def, 0);
-		      gcc_assert (TREE_CODE (arg) == SSA_NAME);
-		      arg = SSA_NAME_VAR (arg);
-		      if (arg == loop->simduid
-			  /* For now.  */
-			  && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
-			{
-			  DR_OFFSET (newdr) = ssize_int (0);
-			  DR_STEP (newdr) = step;
-			  DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
-			  DR_STEP_ALIGNMENT (newdr)
-			    = highest_pow2_factor (step);
-			  /* Mark as simd-lane access.  */
-			  newdr->aux = (void *)-1;
-			  free_data_ref (dr);
-			  datarefs->safe_push (newdr);
-			  return opt_result::success ();
-			}
+		      DR_BASE_ADDRESS (newdr) = base_address;
+		      DR_OFFSET (newdr) = ssize_int (0);
+		      DR_STEP (newdr) = step;
+		      DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
+		      DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
+		      /* Mark as simd-lane access.  */
+		      tree arg2 = gimple_call_arg (def, 1);
+		      newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
+		      free_data_ref (dr);
+		      datarefs->safe_push (newdr);
+		      return opt_result::success ();
 		    }
 		}
 	    }
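
The rewritten detection still looks for an address computed from
.GOMP_SIMD_LANE (simduid), but now also copes with the lane offset folded
into the base POINTER_PLUS_EXPR, with the lane value hidden behind a
widening conversion, and with a unit step whose MULT_EXPR has been folded
away; the call's second argument is preserved in the aux encoding handled by
the next hunk.  Such accesses originate from OpenMP simd loops whose
privatized locals need memory backing, along the lines of the sketch below
(compile with -fopenmp; whether a simd array is actually materialized for t
depends on it needing storage):

void scale (float *x, int n)
{
  #pragma omp simd
  for (int i = 0; i < n; i++)
    {
      float t = x[i] * 2.0f;   // t privatized per SIMD lane; a "simd
      x[i] = t + 1.0f;         // array" indexes it by .GOMP_SIMD_LANE
    }
}
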
@@ -4029,9 +4197,9 @@
 */
 
 opt_result
-vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
+vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
 {
-  struct loop *loop = NULL;
+  class loop *loop = NULL;
   unsigned int i;
   struct data_reference *dr;
   tree scalar_type;
@@ -4106,14 +4274,18 @@
         }
 
       /* See if this was detected as SIMD lane access.  */
-      if (dr->aux == (void *)-1)
+      if (dr->aux == (void *)-1
+	  || dr->aux == (void *)-2
+	  || dr->aux == (void *)-3
+	  || dr->aux == (void *)-4)
 	{
 	  if (nested_in_vect_loop_p (loop, stmt_info))
 	    return opt_result::failure_at (stmt_info->stmt,
 					   "not vectorized:"
 					   " data ref analysis failed: %G",
 					   stmt_info->stmt);
-	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
+	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
+	    = -(uintptr_t) dr->aux;
 	}
 
       tree base = get_base_address (DR_REF (dr));
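
This is the consumer side of the widened simd-lane marker: the aux sentinel
is no longer the single value (void *)-1 but -1 - arg2, where arg2 is the
second .GOMP_SIMD_LANE argument, so the check accepts -1 through -4 and
STMT_VINFO_SIMD_LANE_ACCESS_P now stores the small code -(uintptr_t) aux
rather than a plain boolean.  The encode/decode round trip in miniature:

#include <cassert>
#include <cstdint>

int main ()
{
  for (uintptr_t arg2 = 0; arg2 < 4; ++arg2)
    {
      // Encode as in the previous hunk: aux = (void *)(-1 - arg2).
      void *aux = (void *) (uintptr_t) (-1 - (intptr_t) arg2);
      // Decode as above: a small non-zero access code, here arg2 + 1.
      uintptr_t code = - (uintptr_t) aux;
      assert (code == arg2 + 1);
    }
  return 0;
}
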
@@ -4142,7 +4314,7 @@
 	{
 	  if (nested_in_vect_loop_p (loop, stmt_info))
 	    return opt_result::failure_at (stmt_info->stmt,
-					   "not vectorized:"
+					   "not vectorized: "
 					   "not suitable for strided load %G",
 					   stmt_info->stmt);
 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
@@ -4203,9 +4375,8 @@
 
       /* Set vectype for STMT.  */
       scalar_type = TREE_TYPE (DR_REF (dr));
-      STMT_VINFO_VECTYPE (stmt_info)
-	= get_vectype_for_scalar_type (scalar_type);
-      if (!STMT_VINFO_VECTYPE (stmt_info))
+      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
+      if (!vectype)
         {
           if (dump_enabled_p ())
             {
@@ -4225,6 +4396,8 @@
 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
 	      continue;
 	    }
+	  if (fatal)
+	    *fatal = false;
 	  return opt_result::failure_at (stmt_info->stmt,
 					 "not vectorized:"
 					 " no vectype for stmt: %G"
@@ -4236,27 +4409,37 @@
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_NOTE, vect_location,
 			     "got vectype for stmt: %G%T\n",
-			     stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
+			     stmt_info->stmt, vectype);
 	}
 
       /* Adjust the minimal vectorization factor according to the
 	 vector type.  */
-      vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
+      vf = TYPE_VECTOR_SUBPARTS (vectype);
       *min_vf = upper_bound (*min_vf, vf);
 
+      /* Leave the BB vectorizer to pick the vector type later, based on
+	 the final dataref group size and SLP node size.  */
+      if (is_a <loop_vec_info> (vinfo))
+	STMT_VINFO_VECTYPE (stmt_info) = vectype;
+
       if (gatherscatter != SG_NONE)
 	{
 	  gather_scatter_info gs_info;
 	  if (!vect_check_gather_scatter (stmt_info,
 					  as_a <loop_vec_info> (vinfo),
 					  &gs_info)
-	      || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
-	    return opt_result::failure_at
-	      (stmt_info->stmt,
-	       (gatherscatter == GATHER) ?
-	       "not vectorized: not suitable for gather load %G" :
-	       "not vectorized: not suitable for scatter store %G",
-	       stmt_info->stmt);
+	      || !get_vectype_for_scalar_type (vinfo,
+					       TREE_TYPE (gs_info.offset)))
+	    {
+	      if (fatal)
+		*fatal = false;
+	      return opt_result::failure_at
+			(stmt_info->stmt,
+			 (gatherscatter == GATHER)
+			 ? "not vectorized: not suitable for gather load %G"
+			 : "not vectorized: not suitable for scatter store %G",
+			 stmt_info->stmt);
+	    }
 	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
 	}
     }
@@ -4358,7 +4541,8 @@
     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
   else
     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
-			    DR_TARGET_ALIGNMENT (dr_info), misalign);
+			    known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
+			    misalign);
 }
 
 /* Function vect_create_addr_base_for_vector_ref.
@@ -4413,7 +4597,7 @@
   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
 
   tree data_ref_base = unshare_expr (drb->base_address);
-  tree base_offset = unshare_expr (drb->offset);
+  tree base_offset = unshare_expr (get_dr_vinfo_offset (dr_info, true));
   tree init = unshare_expr (drb->init);
 
   if (loop_vinfo)
@@ -4531,16 +4715,16 @@
 
 tree
 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
-			  struct loop *at_loop, tree offset,
+			  class loop *at_loop, tree offset,
 			  tree *initial_address, gimple_stmt_iterator *gsi,
 			  gimple **ptr_incr, bool only_init,
 			  tree byte_offset, tree iv_step)
 {
   const char *base_name;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = NULL;
+  class loop *loop = NULL;
   bool nested_in_vect_loop = false;
-  struct loop *containing_loop = NULL;
+  class loop *containing_loop = NULL;
   tree aggr_ptr_type;
   tree aggr_ptr;
   tree new_temp;
@@ -5285,13 +5469,13 @@
                         tree *realignment_token,
 			enum dr_alignment_support alignment_support_scheme,
 			tree init_addr,
-			struct loop **at_loop)
+			class loop **at_loop)
 {
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
   struct data_reference *dr = dr_info->dr;
-  struct loop *loop = NULL;
+  class loop *loop = NULL;
   edge pe = NULL;
   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
   tree vec_dest;
@@ -5306,8 +5490,8 @@
   gimple_seq stmts = NULL;
   bool compute_in_loop = false;
   bool nested_in_vect_loop = false;
-  struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
-  struct loop *loop_for_initial_load = NULL;
+  class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
+  class loop *loop_for_initial_load = NULL;
 
   if (loop_vinfo)
     {
@@ -5402,10 +5586,13 @@
 	new_temp = copy_ssa_name (ptr);
       else
 	new_temp = make_ssa_name (TREE_TYPE (ptr));
-      unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
+      poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
+      tree type = TREE_TYPE (ptr);
       new_stmt = gimple_build_assign
 		   (new_temp, BIT_AND_EXPR, ptr,
-		    build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
+		    fold_build2 (MINUS_EXPR, type,
+				 build_int_cst (type, 0),
+				 build_int_cst (type, align)));
       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
       gcc_assert (!new_bb);
       data_ref
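
The realignment mask used to be the immediate -(HOST_WIDE_INT) align; with
DR_TARGET_ALIGNMENT now a poly_uint64 the mask is built as (0 - align)
instead, which folds to the very same constant whenever the alignment is
one.  Plain-integer equivalent of the ptr & -align rounding (illustrative
values):

#include <cassert>
#include <cstdint>

int main ()
{
  uint64_t ptr = 0x1007, align = 16;        // align is a power of two
  uint64_t floor_addr = ptr & (0 - align);  // same bits as ptr & -align
  assert (floor_addr == 0x1000);            // rounded down to 16 bytes
  return 0;
}
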
@@ -6240,12 +6427,14 @@
        correspond to the gaps.  */
       if (next_stmt_info != first_stmt_info
 	  && gap_count < DR_GROUP_GAP (next_stmt_info))
-      {
-        gap_count++;
-        continue;
-      }
-
-      while (next_stmt_info)
+	{
+	  gap_count++;
+	  continue;
+	}
+
+      /* ???  The following needs cleanup after the removal of
+         DR_GROUP_SAME_DR_STMT.  */
+      if (next_stmt_info)
         {
 	  stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
@@ -6255,29 +6444,21 @@
 	    STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
 	  else
             {
-	      if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
-                {
-		  stmt_vec_info prev_stmt_info
-		    = STMT_VINFO_VEC_STMT (next_stmt_info);
-		  stmt_vec_info rel_stmt_info
-		    = STMT_VINFO_RELATED_STMT (prev_stmt_info);
-		  while (rel_stmt_info)
-		    {
-		      prev_stmt_info = rel_stmt_info;
-		      rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
-		    }
-
-		  STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
-                }
+	      stmt_vec_info prev_stmt_info
+		= STMT_VINFO_VEC_STMT (next_stmt_info);
+	      stmt_vec_info rel_stmt_info
+		= STMT_VINFO_RELATED_STMT (prev_stmt_info);
+	      while (rel_stmt_info)
+		{
+		  prev_stmt_info = rel_stmt_info;
+		  rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
+		}
+
+	      STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
             }
 
 	  next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
 	  gap_count = 1;
-	  /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
-	     put the same TMP_DATA_REF as its vectorized statement; otherwise
-	     get the next data-ref from RESULT_CHAIN.  */
-	  if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
-	    break;
         }
     }
 }
@@ -6288,7 +6469,7 @@
    on ALIGNMENT bit boundary.  */
 
 bool
-vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
+vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
 {
   if (!VAR_P (decl))
     return false;
@@ -6298,9 +6479,10 @@
     return false;
 
   if (TREE_STATIC (decl))
-    return (alignment <= MAX_OFILE_ALIGNMENT);
+    return (known_le (alignment,
+		      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
   else
-    return (alignment <= MAX_STACK_ALIGNMENT);
+    return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
 }
 
 
@@ -6319,7 +6501,7 @@
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   machine_mode mode = TYPE_MODE (vectype);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *vect_loop = NULL;
+  class loop *vect_loop = NULL;
   bool nested_in_vect_loop = false;
 
   if (aligned_access_p (dr_info) && !check_aligned_accesses)