comparison gcc/tree-vect-loop.c @ 67:f6334be47118

update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
author nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
date Tue, 22 Mar 2011 17:18:12 +0900
parents b7f97abdc517
children 04ced10e8804
comparing 65:65488c3d617d with 67:f6334be47118
25 #include "coretypes.h" 25 #include "coretypes.h"
26 #include "tm.h" 26 #include "tm.h"
27 #include "ggc.h" 27 #include "ggc.h"
28 #include "tree.h" 28 #include "tree.h"
29 #include "basic-block.h" 29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-pretty-print.h" 30 #include "tree-pretty-print.h"
32 #include "gimple-pretty-print.h" 31 #include "gimple-pretty-print.h"
33 #include "tree-flow.h" 32 #include "tree-flow.h"
34 #include "tree-dump.h" 33 #include "tree-dump.h"
35 #include "cfgloop.h" 34 #include "cfgloop.h"
36 #include "cfglayout.h" 35 #include "cfglayout.h"
37 #include "expr.h" 36 #include "expr.h"
38 #include "recog.h" 37 #include "recog.h"
39 #include "optabs.h" 38 #include "optabs.h"
40 #include "params.h" 39 #include "params.h"
41 #include "toplev.h" 40 #include "diagnostic-core.h"
42 #include "tree-chrec.h" 41 #include "tree-chrec.h"
43 #include "tree-scalar-evolution.h" 42 #include "tree-scalar-evolution.h"
44 #include "tree-vectorizer.h" 43 #include "tree-vectorizer.h"
44 #include "target.h"
45 45
46 /* Loop Vectorization Pass. 46 /* Loop Vectorization Pass.
47 47
48 This pass tries to vectorize loops. 48 This pass tries to vectorize loops.
49 49
73 the vectorizer applies a set of analyses on a given set of loops, 73 the vectorizer applies a set of analyses on a given set of loops,
74 followed by the actual vectorization transformation for the loops that 74 followed by the actual vectorization transformation for the loops that
75 had successfully passed the analysis phase. 75 had successfully passed the analysis phase.
76 Throughout this pass we make a distinction between two types of 76 Throughout this pass we make a distinction between two types of
77 data: scalars (which are represented by SSA_NAMES), and memory references 77 data: scalars (which are represented by SSA_NAMES), and memory references
78 ("data-refs"). These two types of data require different handling both 78 ("data-refs"). These two types of data require different handling both
79 during analysis and transformation. The types of data-refs that the 79 during analysis and transformation. The types of data-refs that the
80 vectorizer currently supports are ARRAY_REFS whose base is an array DECL 80 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer 81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
82 accesses are required to have a simple (consecutive) access pattern. 82 accesses are required to have a simple (consecutive) access pattern.
83 83
94 94
95 Transformation phase: 95 Transformation phase:
96 ===================== 96 =====================
97 The loop transformation phase scans all the stmts in the loop, and 97 The loop transformation phase scans all the stmts in the loop, and
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in 98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
99 the loop that needs to be vectorized. It inserts the vector code sequence 99 the loop that needs to be vectorized. It inserts the vector code sequence
100 just before the scalar stmt S, and records a pointer to the vector code 100 just before the scalar stmt S, and records a pointer to the vector code
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct 101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
102 attached to S). This pointer will be used for the vectorization of following 102 attached to S). This pointer will be used for the vectorization of following
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory; 103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
104 otherwise, we rely on dead code elimination for removing it. 104 otherwise, we rely on dead code elimination for removing it.
105 105
106 For example, say stmt S1 was vectorized into stmt VS1: 106 For example, say stmt S1 was vectorized into stmt VS1:
107 107
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
110 S2: a = b; 110 S2: a = b;
111 111
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines 112 To vectorize stmt S2, the vectorizer first finds the stmt that defines
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the 113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The 114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
115 resulting sequence would be: 115 resulting sequence would be:
116 116
117 VS1: vb = px[i]; 117 VS1: vb = px[i];
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
119 VS2: va = vb; 119 VS2: va = vb;
123 load/store operations (like 'x[i]' in S1), and are handled differently. 123 load/store operations (like 'x[i]' in S1), and are handled differently.
124 124
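Illustration only, not part of this changeset: the bookkeeping above pictured on a tiny loop, with VF and the vector pointers px/py assumed.

    /* Scalar loop as the pass sees it:
         S1: b = x[i];   S2: a = b;   S3: y[i] = a;   */
    void copy (float *x, float *y, int n)
    {
      int i;
      for (i = 0; i < n; i++)
        y[i] = x[i];
    }

    /* Conceptually, each scalar stmt gets a vector twin inserted just
       before it, linked via STMT_VINFO_VEC_STMT:
         VS1: vb = px[i];   VS2: va = vb;   VS3: py[i] = va;
       S3 (a memory write) is removed; S1/S2 are left for dead code
       elimination.  */
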
125 Target modeling: 125 Target modeling:
126 ================= 126 =================
127 Currently the only target specific information that is used is the 127 Currently the only target specific information that is used is the
128 size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can 128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
129 support different sizes of vectors, for now will need to specify one value 129 Targets that can support different sizes of vectors, for now will need
130 for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future. 130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
131 flexibility will be added in the future.
131 132
132 Since we only vectorize operations whose vector form can be 133 Since we only vectorize operations whose vector form can be
133 expressed using existing tree codes, to verify that an operation is 134 expressed using existing tree codes, to verify that an operation is
134 supported, the vectorizer checks the relevant optab at the relevant 135 supported, the vectorizer checks the relevant optab at the relevant
135 machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If 136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
136 the value found is CODE_FOR_nothing, then there's no target support, and 137 the value found is CODE_FOR_nothing, then there's no target support, and
137 we can't vectorize the stmt. 138 we can't vectorize the stmt.
138 139
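A hedged sketch of that support test, using the 4.6-era API this hunk switches to (optab_handler now returns the insn_code directly); vectype here is an assumed variable.

    /* Illustration: can this target add two V8HI vectors?  */
    optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
    if (!op || optab_handler (op, V8HImode) == CODE_FOR_nothing)
      return false;   /* no target support: the stmt can't be vectorized */
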
139 For additional information on this project see: 140 For additional information on this project see:
140 http://gcc.gnu.org/projects/tree-ssa/vectorization.html 141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
141 */ 142 */
142 143
143 /* Function vect_determine_vectorization_factor 144 /* Function vect_determine_vectorization_factor
144 145
145 Determine the vectorization factor (VF). VF is the number of data elements 146 Determine the vectorization factor (VF). VF is the number of data elements
146 that are operated upon in parallel in a single iteration of the vectorized 147 that are operated upon in parallel in a single iteration of the vectorized
147 loop. For example, when vectorizing a loop that operates on 4-byte elements, 148 loop. For example, when vectorizing a loop that operates on 4-byte elements,
148 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4 149 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
149 elements can fit in a single vector register. 150 elements can fit in a single vector register.
150 151
151 We currently support vectorization of loops in which all types operated upon 152 We currently support vectorization of loops in which all types operated upon
152 are of the same size. Therefore this function currently sets VF according to 153 are of the same size. Therefore this function currently sets VF according to
153 the size of the types operated upon, and fails if there are multiple sizes 154 the size of the types operated upon, and fails if there are multiple sizes
154 in the loop. 155 in the loop.
155 156
156 VF is also the factor by which the loop iterations are strip-mined, e.g.: 157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
157 original loop: 158 original loop:
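The example's body is elided by the diff context; as a hypothetical sketch, strip-mining by VF = 4 with GCC's generic vector extension (alignment concerns ignored):

    typedef int v4si __attribute__ ((vector_size (16)));

    for (i = 0; i < (N & ~3); i += 4)               /* vector loop     */
      *(v4si *) &a[i] = *(v4si *) &b[i] + *(v4si *) &c[i];
    for (; i < N; i++)                              /* scalar epilogue */
      a[i] = b[i] + c[i];
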
434 } 435 }
435 436
436 /* Function vect_analyze_scalar_cycles_1. 437 /* Function vect_analyze_scalar_cycles_1.
437 438
438 Examine the cross iteration def-use cycles of scalar variables 439 Examine the cross iteration def-use cycles of scalar variables
439 in LOOP. LOOP_VINFO represents the loop that is now being 440 in LOOP. LOOP_VINFO represents the loop that is now being
440 considered for vectorization (can be LOOP, or an outer-loop 441 considered for vectorization (can be LOOP, or an outer-loop
441 enclosing LOOP). */ 442 enclosing LOOP). */
442 443
443 static void 444 static void
444 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) 445 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
450 bool double_reduc; 451 bool double_reduc;
451 452
452 if (vect_print_dump_info (REPORT_DETAILS)) 453 if (vect_print_dump_info (REPORT_DETAILS))
453 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ==="); 454 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
454 455
455 /* First - identify all inductions. Reduction detection assumes that all the 456 /* First - identify all inductions. Reduction detection assumes that all the
456 inductions have been identified, therefore, this order must not be 457 inductions have been identified, therefore, this order must not be
457 changed. */ 458 changed. */
458 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 459 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
459 { 460 {
460 gimple phi = gsi_stmt (gsi); 461 gimple phi = gsi_stmt (gsi);
466 { 467 {
467 fprintf (vect_dump, "Analyze phi: "); 468 fprintf (vect_dump, "Analyze phi: ");
468 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); 469 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
469 } 470 }
470 471
471 /* Skip virtual phi's. The data dependences that are associated with 472 /* Skip virtual phi's. The data dependences that are associated with
472 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */ 473 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
473 if (!is_gimple_reg (SSA_NAME_VAR (def))) 474 if (!is_gimple_reg (SSA_NAME_VAR (def)))
474 continue; 475 continue;
475 476
476 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type; 477 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
477 478
478 /* Analyze the evolution function. */ 479 /* Analyze the evolution function. */
479 access_fn = analyze_scalar_evolution (loop, def); 480 access_fn = analyze_scalar_evolution (loop, def);
481 if (access_fn)
482 STRIP_NOPS (access_fn);
480 if (access_fn && vect_print_dump_info (REPORT_DETAILS)) 483 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
481 { 484 {
482 fprintf (vect_dump, "Access function of PHI: "); 485 fprintf (vect_dump, "Access function of PHI: ");
483 print_generic_expr (vect_dump, access_fn, TDF_SLIM); 486 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
484 } 487 }
565 568
566 569
567 /* Function vect_analyze_scalar_cycles. 570 /* Function vect_analyze_scalar_cycles.
568 571
569 Examine the cross iteration def-use cycles of scalar variables, by 572 Examine the cross iteration def-use cycles of scalar variables, by
570 analyzing the loop-header PHIs of scalar variables; Classify each 573 analyzing the loop-header PHIs of scalar variables. Classify each
571 cycle as one of the following: invariant, induction, reduction, unknown. 574 cycle as one of the following: invariant, induction, reduction, unknown.
572 We do that for the loop represented by LOOP_VINFO, and also for its 575 We do that for the loop represented by LOOP_VINFO, and also for its
573 inner-loop, if it exists. 576 inner-loop, if it exists.
574 Examples for scalar cycles: 577 Examples for scalar cycles:
575 578
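The examples themselves fall outside this hunk; a hypothetical pair for two of the categories:

    for (i = 0; i < n; i++)   /* induction: i = PHI <0, i + 1>            */
      sum += a[i];            /* reduction: sum = PHI <init, sum + a[i]>;
                                 its value is only used after the loop    */
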
740 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL; 743 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
741 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0; 744 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
742 LOOP_VINFO_VECTORIZABLE_P (res) = 0; 745 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
743 LOOP_PEELING_FOR_ALIGNMENT (res) = 0; 746 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
744 LOOP_VINFO_VECT_FACTOR (res) = 0; 747 LOOP_VINFO_VECT_FACTOR (res) = 0;
748 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
745 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10); 749 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
746 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10); 750 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
747 LOOP_VINFO_UNALIGNED_DR (res) = NULL; 751 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
748 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = 752 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
749 VEC_alloc (gimple, heap, 753 VEC_alloc (gimple, heap,
753 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); 757 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
754 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10); 758 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
755 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10); 759 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
756 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10); 760 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
757 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1; 761 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
762 LOOP_VINFO_PEELING_HTAB (res) = NULL;
758 763
759 return res; 764 return res;
760 } 765 }
761 766
762 767
787 if (!clean_stmts) 792 if (!clean_stmts)
788 { 793 {
789 free (LOOP_VINFO_BBS (loop_vinfo)); 794 free (LOOP_VINFO_BBS (loop_vinfo));
790 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo)); 795 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
791 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); 796 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
797 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
792 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); 798 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
799 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
793 800
794 free (loop_vinfo); 801 free (loop_vinfo);
795 loop->aux = NULL; 802 loop->aux = NULL;
796 return; 803 return;
797 } 804 }
833 } 840 }
834 841
835 free (LOOP_VINFO_BBS (loop_vinfo)); 842 free (LOOP_VINFO_BBS (loop_vinfo));
836 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo)); 843 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
837 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); 844 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
845 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
838 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); 846 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
839 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)); 847 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
840 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); 848 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
841 for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++) 849 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
842 vect_free_slp_instance (instance); 850 vect_free_slp_instance (instance);
843 851
844 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); 852 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
845 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo)); 853 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
846 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo)); 854 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
855
856 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
857 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
847 858
848 free (loop_vinfo); 859 free (loop_vinfo);
849 loop->aux = NULL; 860 loop->aux = NULL;
850 } 861 }
851 862
1115 loop->aux = loop_vinfo; 1126 loop->aux = loop_vinfo;
1116 return loop_vinfo; 1127 return loop_vinfo;
1117 } 1128 }
1118 1129
1119 1130
1131 /* Get cost by calling cost target builtin. */
1132
1133 static inline int
1134 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1135 {
1136 tree dummy_type = NULL;
1137 int dummy = 0;
1138
1139 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1140 dummy_type, dummy);
1141 }
1142
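Illustrative use of the new helper, taken from call sites later in this diff:

    peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
                             + vect_get_cost (cond_branch_not_taken));
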
1143
1120 /* Function vect_analyze_loop_operations. 1144 /* Function vect_analyze_loop_operations.
1121 1145
1122 Scan the loop stmts and make sure they are all vectorizable. */ 1146 Scan the loop stmts and make sure they are all vectorizable. */
1123 1147
1124 static bool 1148 static bool
1280 fprintf (vect_dump,"not vectorized: iteration count smaller than " 1304 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1281 "vectorization factor."); 1305 "vectorization factor.");
1282 return false; 1306 return false;
1283 } 1307 }
1284 1308
1285 /* Analyze cost. Decide if it is worthwhile to vectorize. */ 1309 /* Analyze cost. Decide if it is worthwhile to vectorize. */
1286 1310
1287 /* Once VF is set, SLP costs should be updated since the number of created 1311 /* Once VF is set, SLP costs should be updated since the number of created
1288 vector stmts depends on VF. */ 1312 vector stmts depends on VF. */
1289 vect_update_slp_costs_according_to_vf (loop_vinfo); 1313 vect_update_slp_costs_according_to_vf (loop_vinfo);
1290 1314
1350 1374
1351 return true; 1375 return true;
1352 } 1376 }
1353 1377
1354 1378
1355 /* Function vect_analyze_loop. 1379 /* Function vect_analyze_loop_2.
1356 1380
1357 Apply a set of analyses on LOOP, and create a loop_vec_info struct 1381 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1358 for it. The different analyses will record information in the 1382 for it. The different analyses will record information in the
1359 loop_vec_info struct. */ 1383 loop_vec_info struct. */
1360 loop_vec_info 1384 static bool
1361 vect_analyze_loop (struct loop *loop) 1385 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1362 { 1386 {
1363 bool ok; 1387 bool ok, dummy;
1364 loop_vec_info loop_vinfo;
1365 int max_vf = MAX_VECTORIZATION_FACTOR; 1388 int max_vf = MAX_VECTORIZATION_FACTOR;
1366 int min_vf = 2; 1389 int min_vf = 2;
1367
1368 if (vect_print_dump_info (REPORT_DETAILS))
1369 fprintf (vect_dump, "===== analyze_loop_nest =====");
1370
1371 if (loop_outer (loop)
1372 && loop_vec_info_for_loop (loop_outer (loop))
1373 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1374 {
1375 if (vect_print_dump_info (REPORT_DETAILS))
1376 fprintf (vect_dump, "outer-loop already vectorized.");
1377 return NULL;
1378 }
1379
1380 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1381
1382 loop_vinfo = vect_analyze_loop_form (loop);
1383 if (!loop_vinfo)
1384 {
1385 if (vect_print_dump_info (REPORT_DETAILS))
1386 fprintf (vect_dump, "bad loop form.");
1387 return NULL;
1388 }
1389 1390
1390 /* Find all data references in the loop (which correspond to vdefs/vuses) 1391 /* Find all data references in the loop (which correspond to vdefs/vuses)
1391 and analyze their evolution in the loop. Also adjust the minimal 1392 and analyze their evolution in the loop. Also adjust the minimal
1392 vectorization factor according to the loads and stores. 1393 vectorization factor according to the loads and stores.
1393 1394
1397 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf); 1398 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1398 if (!ok) 1399 if (!ok)
1399 { 1400 {
1400 if (vect_print_dump_info (REPORT_DETAILS)) 1401 if (vect_print_dump_info (REPORT_DETAILS))
1401 fprintf (vect_dump, "bad data references."); 1402 fprintf (vect_dump, "bad data references.");
1402 destroy_loop_vec_info (loop_vinfo, true); 1403 return false;
1403 return NULL;
1404 } 1404 }
1405 1405
1406 /* Classify all cross-iteration scalar data-flow cycles. 1406 /* Classify all cross-iteration scalar data-flow cycles.
1407 Cross-iteration cycles caused by virtual phis are analyzed separately. */ 1407 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1408 1408
1415 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); 1415 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1416 if (!ok) 1416 if (!ok)
1417 { 1417 {
1418 if (vect_print_dump_info (REPORT_DETAILS)) 1418 if (vect_print_dump_info (REPORT_DETAILS))
1419 fprintf (vect_dump, "unexpected pattern."); 1419 fprintf (vect_dump, "unexpected pattern.");
1420 destroy_loop_vec_info (loop_vinfo, true); 1420 return false;
1421 return NULL;
1422 } 1421 }
1423 1422
1424 /* Analyze data dependences between the data-refs in the loop 1423 /* Analyze data dependences between the data-refs in the loop
1425 and adjust the maximum vectorization factor according to 1424 and adjust the maximum vectorization factor according to
1426 the dependences. 1425 the dependences.
1427 FORNOW: fail at the first data dependence that we encounter. */ 1426 FORNOW: fail at the first data dependence that we encounter. */
1428 1427
1429 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf); 1428 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
1430 if (!ok 1429 if (!ok
1431 || max_vf < min_vf) 1430 || max_vf < min_vf)
1432 { 1431 {
1433 if (vect_print_dump_info (REPORT_DETAILS)) 1432 if (vect_print_dump_info (REPORT_DETAILS))
1434 fprintf (vect_dump, "bad data dependence."); 1433 fprintf (vect_dump, "bad data dependence.");
1435 destroy_loop_vec_info (loop_vinfo, true); 1434 return false;
1436 return NULL;
1437 } 1435 }
1438 1436
1439 ok = vect_determine_vectorization_factor (loop_vinfo); 1437 ok = vect_determine_vectorization_factor (loop_vinfo);
1440 if (!ok) 1438 if (!ok)
1441 { 1439 {
1442 if (vect_print_dump_info (REPORT_DETAILS)) 1440 if (vect_print_dump_info (REPORT_DETAILS))
1443 fprintf (vect_dump, "can't determine vectorization factor."); 1441 fprintf (vect_dump, "can't determine vectorization factor.");
1444 destroy_loop_vec_info (loop_vinfo, true); 1442 return false;
1445 return NULL;
1446 } 1443 }
1447 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo)) 1444 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1448 { 1445 {
1449 if (vect_print_dump_info (REPORT_DETAILS)) 1446 if (vect_print_dump_info (REPORT_DETAILS))
1450 fprintf (vect_dump, "bad data dependence."); 1447 fprintf (vect_dump, "bad data dependence.");
1451 destroy_loop_vec_info (loop_vinfo, true); 1448 return false;
1452 return NULL;
1453 } 1449 }
1454 1450
1455 /* Analyze the alignment of the data-refs in the loop. 1451 /* Analyze the alignment of the data-refs in the loop.
1456 Fail if a data reference is found that cannot be vectorized. */ 1452 Fail if a data reference is found that cannot be vectorized. */
1457 1453
1458 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL); 1454 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1459 if (!ok) 1455 if (!ok)
1460 { 1456 {
1461 if (vect_print_dump_info (REPORT_DETAILS)) 1457 if (vect_print_dump_info (REPORT_DETAILS))
1462 fprintf (vect_dump, "bad data alignment."); 1458 fprintf (vect_dump, "bad data alignment.");
1463 destroy_loop_vec_info (loop_vinfo, true); 1459 return false;
1464 return NULL;
1465 } 1460 }
1466 1461
1467 /* Analyze the access patterns of the data-refs in the loop (consecutive, 1462 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1468 complex, etc.). FORNOW: Only handle consecutive access pattern. */ 1463 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1469 1464
1470 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL); 1465 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1471 if (!ok) 1466 if (!ok)
1472 { 1467 {
1473 if (vect_print_dump_info (REPORT_DETAILS)) 1468 if (vect_print_dump_info (REPORT_DETAILS))
1474 fprintf (vect_dump, "bad data access."); 1469 fprintf (vect_dump, "bad data access.");
1475 destroy_loop_vec_info (loop_vinfo, true); 1470 return false;
1476 return NULL;
1477 } 1471 }
1478 1472
1479 /* Prune the list of ddrs to be tested at run-time by versioning for alias. 1473 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1480 It is important to call pruning after vect_analyze_data_ref_accesses, 1474 It is important to call pruning after vect_analyze_data_ref_accesses,
1481 since we use grouping information gathered by interleaving analysis. */ 1475 since we use grouping information gathered by interleaving analysis. */
1483 if (!ok) 1477 if (!ok)
1484 { 1478 {
1485 if (vect_print_dump_info (REPORT_DETAILS)) 1479 if (vect_print_dump_info (REPORT_DETAILS))
1486 fprintf (vect_dump, "too long list of versioning for alias " 1480 fprintf (vect_dump, "too long list of versioning for alias "
1487 "run-time tests."); 1481 "run-time tests.");
1488 destroy_loop_vec_info (loop_vinfo, true); 1482 return false;
1489 return NULL; 1483 }
1484
1485 /* This pass will decide on using loop versioning and/or loop peeling in
1486 order to enhance the alignment of data references in the loop. */
1487
1488 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1489 if (!ok)
1490 {
1491 if (vect_print_dump_info (REPORT_DETAILS))
1492 fprintf (vect_dump, "bad data alignment.");
1493 return false;
1490 } 1494 }
1491 1495
1492 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ 1496 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1493 ok = vect_analyze_slp (loop_vinfo, NULL); 1497 ok = vect_analyze_slp (loop_vinfo, NULL);
1494 if (ok) 1498 if (ok)
1498 1502
1499 /* Find stmts that need to be both vectorized and SLPed. */ 1503 /* Find stmts that need to be both vectorized and SLPed. */
1500 vect_detect_hybrid_slp (loop_vinfo); 1504 vect_detect_hybrid_slp (loop_vinfo);
1501 } 1505 }
1502 1506
1503 /* This pass will decide on using loop versioning and/or loop peeling in
1504 order to enhance the alignment of data references in the loop. */
1505
1506 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1507 if (!ok)
1508 {
1509 if (vect_print_dump_info (REPORT_DETAILS))
1510 fprintf (vect_dump, "bad data alignment.");
1511 destroy_loop_vec_info (loop_vinfo, true);
1512 return NULL;
1513 }
1514
1515 /* Scan all the operations in the loop and make sure they are 1507 /* Scan all the operations in the loop and make sure they are
1516 vectorizable. */ 1508 vectorizable. */
1517 1509
1518 ok = vect_analyze_loop_operations (loop_vinfo); 1510 ok = vect_analyze_loop_operations (loop_vinfo);
1519 if (!ok) 1511 if (!ok)
1520 { 1512 {
1521 if (vect_print_dump_info (REPORT_DETAILS)) 1513 if (vect_print_dump_info (REPORT_DETAILS))
1522 fprintf (vect_dump, "bad operation or unsupported loop bound."); 1514 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1515 return false;
1516 }
1517
1518 return true;
1519 }
1520
1521 /* Function vect_analyze_loop.
1522
1523 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1524 for it. The different analyses will record information in the
1525 loop_vec_info struct. */
1526 loop_vec_info
1527 vect_analyze_loop (struct loop *loop)
1528 {
1529 loop_vec_info loop_vinfo;
1530 unsigned int vector_sizes;
1531
1532 /* Autodetect first vector size we try. */
1533 current_vector_size = 0;
1534 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1535
1536 if (vect_print_dump_info (REPORT_DETAILS))
1537 fprintf (vect_dump, "===== analyze_loop_nest =====");
1538
1539 if (loop_outer (loop)
1540 && loop_vec_info_for_loop (loop_outer (loop))
1541 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1542 {
1543 if (vect_print_dump_info (REPORT_DETAILS))
1544 fprintf (vect_dump, "outer-loop already vectorized.");
1545 return NULL;
1546 }
1547
1548 while (1)
1549 {
1550 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1551 loop_vinfo = vect_analyze_loop_form (loop);
1552 if (!loop_vinfo)
1553 {
1554 if (vect_print_dump_info (REPORT_DETAILS))
1555 fprintf (vect_dump, "bad loop form.");
1556 return NULL;
1557 }
1558
1559 if (vect_analyze_loop_2 (loop_vinfo))
1560 {
1561 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1562
1563 return loop_vinfo;
1564 }
1565
1523 destroy_loop_vec_info (loop_vinfo, true); 1566 destroy_loop_vec_info (loop_vinfo, true);
1524 return NULL; 1567
1525 } 1568 vector_sizes &= ~current_vector_size;
1526 1569 if (vector_sizes == 0
1527 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; 1570 || current_vector_size == 0)
1528 1571 return NULL;
1529 return loop_vinfo; 1572
1573 /* Try the next biggest vector size. */
1574 current_vector_size = 1 << floor_log2 (vector_sizes);
1575 if (vect_print_dump_info (REPORT_DETAILS))
1576 fprintf (vect_dump, "***** Re-trying analysis with "
1577 "vector size %d\n", current_vector_size);
1578 }
1530 } 1579 }
1531 1580
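Worked example of the retry arithmetic on a hypothetical target: if autovectorize_vector_sizes () returns 32 | 16 = 48 and the autodetected current_vector_size is 32 but analysis fails, the mask becomes 48 & ~32 = 16 and the next attempt uses 1 << floor_log2 (16) = 16 bytes; once the mask is empty the function returns NULL.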
1532 1581
1533 /* Function reduction_code_for_scalar_code 1582 /* Function reduction_code_for_scalar_code
1534 1583
1573 return false; 1622 return false;
1574 } 1623 }
1575 } 1624 }
1576 1625
1577 1626
1578 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement 1627 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1579 STMT is printed with a message MSG. */ 1628 STMT is printed with a message MSG. */
1580 1629
1581 static void 1630 static void
1582 report_vect_op (gimple stmt, const char *msg) 1631 report_vect_op (gimple stmt, const char *msg)
1583 { 1632 {
1587 1636
1588 1637
1589 /* Function vect_is_simple_reduction_1 1638 /* Function vect_is_simple_reduction_1
1590 1639
1591 (1) Detect a cross-iteration def-use cycle that represents a simple 1640 (1) Detect a cross-iteration def-use cycle that represents a simple
1592 reduction computation. We look for the following pattern: 1641 reduction computation. We look for the following pattern:
1593 1642
1594 loop_header: 1643 loop_header:
1595 a1 = phi < a0, a2 > 1644 a1 = phi < a0, a2 >
1596 a3 = ... 1645 a3 = ...
1597 a2 = operation (a3, a1) 1646 a2 = operation (a3, a1)
1598 1647
1599 such that: 1648 such that:
1600 1. operation is commutative and associative and it is safe to 1649 1. operation is commutative and associative and it is safe to
1601 change the order of the computation (if CHECK_REDUCTION is true) 1650 change the order of the computation (if CHECK_REDUCTION is true)
1602 2. no uses for a2 in the loop (a2 is used out of the loop) 1651 2. no uses for a2 in the loop (a2 is used out of the loop)
1603 3. no uses of a1 in the loop besides the reduction operation. 1652 3. no uses of a1 in the loop besides the reduction operation
1604 1653 4. no uses of a1 outside the loop.
1605 Condition 1 is tested here. 1654
1655 Conditions 1,4 are tested here.
1606 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. 1656 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1607 1657
1608 (2) Detect a cross-iteration def-use cycle in nested loops, i.e., 1658 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1609 nested cycles, if CHECK_REDUCTION is false. 1659 nested cycles, if CHECK_REDUCTION is false.
1610 1660
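A hypothetical loop matching pattern (1), annotated with the comment's SSA names:

    int sum = a0;               /* a1 = PHI <a0, a2> in the loop header  */
    for (i = 0; i < n; i++)
      sum = sum + x[i];         /* a3 = x[i];  a2 = a1 + a3, a PLUS_EXPR
                                   (commutative and associative)         */
    use (sum);                  /* a2 escapes only after the loop        */
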
1651 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) 1701 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1652 { 1702 {
1653 gimple use_stmt = USE_STMT (use_p); 1703 gimple use_stmt = USE_STMT (use_p);
1654 if (is_gimple_debug (use_stmt)) 1704 if (is_gimple_debug (use_stmt))
1655 continue; 1705 continue;
1656 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) 1706
1657 && vinfo_for_stmt (use_stmt) 1707 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1708 {
1709 if (vect_print_dump_info (REPORT_DETAILS))
1710 fprintf (vect_dump, "intermediate value used outside loop.");
1711
1712 return NULL;
1713 }
1714
1715 if (vinfo_for_stmt (use_stmt)
1658 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt))) 1716 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1659 nloop_uses++; 1717 nloop_uses++;
1660 if (nloop_uses > 1) 1718 if (nloop_uses > 1)
1661 { 1719 {
1662 if (vect_print_dump_info (REPORT_DETAILS)) 1720 if (vect_print_dump_info (REPORT_DETAILS))
1754 1812
1755 /* We can handle "res -= x[i]", which is non-associative, by 1813 /* We can handle "res -= x[i]", which is non-associative, by
1756 simply rewriting it into "res += -x[i]". Avoid changing the 1814 simply rewriting it into "res += -x[i]". Avoid changing the
1757 gimple instruction for the first simple tests and only do this 1815 gimple instruction for the first simple tests and only do this
1758 if we're allowed to change code at all. */ 1816 if we're allowed to change code at all. */
1759 if (code == MINUS_EXPR && modify) 1817 if (code == MINUS_EXPR
1818 && modify
1819 && (op1 = gimple_assign_rhs1 (def_stmt))
1820 && TREE_CODE (op1) == SSA_NAME
1821 && SSA_NAME_DEF_STMT (op1) == phi)
1760 code = PLUS_EXPR; 1822 code = PLUS_EXPR;
1761 1823
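Concretely: a hypothetical "res -= x[i]" reduction has its MINUS_EXPR re-interpreted as "res += -x[i]", a PLUS_EXPR; the strengthened guard above additionally checks that rhs1 of the definition really is the reduction phi before flipping the code.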
1762 if (check_reduction 1824 if (check_reduction
1763 && (!commutative_tree_code (code) || !associative_tree_code (code))) 1825 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1764 { 1826 {
1919 1981
1920 if (def2 && def2 == phi 1982 if (def2 && def2 == phi
1921 && (code == COND_EXPR 1983 && (code == COND_EXPR
1922 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1)) 1984 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1923 && (is_gimple_assign (def1) 1985 && (is_gimple_assign (def1)
1986 || is_gimple_call (def1)
1924 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) 1987 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1925 == vect_induction_def 1988 == vect_induction_def
1926 || (gimple_code (def1) == GIMPLE_PHI 1989 || (gimple_code (def1) == GIMPLE_PHI
1927 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) 1990 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1928 == vect_internal_def 1991 == vect_internal_def
1934 } 1997 }
1935 else if (def1 && def1 == phi 1998 else if (def1 && def1 == phi
1936 && (code == COND_EXPR 1999 && (code == COND_EXPR
1937 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2)) 2000 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1938 && (is_gimple_assign (def2) 2001 && (is_gimple_assign (def2)
2002 || is_gimple_call (def2)
1939 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) 2003 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1940 == vect_induction_def 2004 == vect_induction_def
1941 || (gimple_code (def2) == GIMPLE_PHI 2005 || (gimple_code (def2) == GIMPLE_PHI
1942 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) 2006 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1943 == vect_internal_def 2007 == vect_internal_def
1991 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi, 2055 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
1992 bool check_reduction, bool *double_reduc) 2056 bool check_reduction, bool *double_reduc)
1993 { 2057 {
1994 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction, 2058 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
1995 double_reduc, true); 2059 double_reduc, true);
2060 }
2061
2062 /* Calculate the cost of one scalar iteration of the loop. */
2063 int
2064 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2065 {
2066 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2067 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2068 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2069 int innerloop_iters, i, stmt_cost;
2070
2071 /* Count statements in scalar loop. Using this as scalar cost for a single
2072 iteration for now.
2073
2074 TODO: Add outer loop support.
2075
2076 TODO: Consider assigning different costs to different scalar
2077 statements. */
2078
2079 /* FORNOW. */
2080 innerloop_iters = 1;
2081 if (loop->inner)
2082 innerloop_iters = 50; /* FIXME */
2083
2084 for (i = 0; i < nbbs; i++)
2085 {
2086 gimple_stmt_iterator si;
2087 basic_block bb = bbs[i];
2088
2089 if (bb->loop_father == loop->inner)
2090 factor = innerloop_iters;
2091 else
2092 factor = 1;
2093
2094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2095 {
2096 gimple stmt = gsi_stmt (si);
2097 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2098
2099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2100 continue;
2101
2102 /* Skip stmts that are not vectorized inside the loop. */
2103 if (stmt_info
2104 && !STMT_VINFO_RELEVANT_P (stmt_info)
2105 && (!STMT_VINFO_LIVE_P (stmt_info)
2106 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2107 continue;
2108
2109 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2110 {
2111 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2112 stmt_cost = vect_get_cost (scalar_load);
2113 else
2114 stmt_cost = vect_get_cost (scalar_store);
2115 }
2116 else
2117 stmt_cost = vect_get_cost (scalar_stmt);
2118
2119 scalar_single_iter_cost += stmt_cost * factor;
2120 }
2121 }
2122 return scalar_single_iter_cost;
2123 }
2124
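For instance, assuming the target hook returns 1 for each cost kind: a body with one load, one store and two other statements costs 1 + 1 + 2 = 4 per scalar iteration, and statements inside an inner loop are weighted by the FORNOW factor of 50.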
2125 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2126 int
2127 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2128 int *peel_iters_epilogue,
2129 int scalar_single_iter_cost)
2130 {
2131 int peel_guard_costs = 0;
2132 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2133
2134 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2135 {
2136 *peel_iters_epilogue = vf/2;
2137 if (vect_print_dump_info (REPORT_COST))
2138 fprintf (vect_dump, "cost model: "
2139 "epilogue peel iters set to vf/2 because "
2140 "loop iterations are unknown .");
2141
2142 /* If peeled iterations are known but number of scalar loop
2143 iterations are unknown, count a taken branch per peeled loop. */
2144 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2145 }
2146 else
2147 {
2148 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2149 peel_iters_prologue = niters < peel_iters_prologue ?
2150 niters : peel_iters_prologue;
2151 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2152 }
2153
2154 return (peel_iters_prologue * scalar_single_iter_cost)
2155 + (*peel_iters_epilogue * scalar_single_iter_cost)
2156 + peel_guard_costs;
1996 } 2157 }
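Worked example for the known-niters branch: with niters = 100, vf = 4 and peel_iters_prologue = 3, the epilogue gets (100 - 3) % 4 = 1 iteration and the function returns (3 + 1) * scalar_single_iter_cost plus zero guard cost.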
1997 2158
1998 /* Function vect_estimate_min_profitable_iters 2159 /* Function vect_estimate_min_profitable_iters
1999 2160
2000 Return the number of iterations required for the vector version of the 2161 Return the number of iterations required for the vector version of the
2017 int scalar_outside_cost = 0; 2178 int scalar_outside_cost = 0;
2018 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2179 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2180 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2020 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 2181 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2021 int nbbs = loop->num_nodes; 2182 int nbbs = loop->num_nodes;
2022 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); 2183 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2023 int peel_guard_costs = 0; 2184 int peel_guard_costs = 0;
2024 int innerloop_iters = 0, factor; 2185 int innerloop_iters = 0, factor;
2025 VEC (slp_instance, heap) *slp_instances; 2186 VEC (slp_instance, heap) *slp_instances;
2026 slp_instance instance; 2187 slp_instance instance;
2027 2188
2055 "versioning aliasing.\n"); 2216 "versioning aliasing.\n");
2056 } 2217 }
2057 2218
2058 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) 2219 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2059 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 2220 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2060 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST; 2221 vec_outside_cost += vect_get_cost (cond_branch_taken);
2061 2222
2062 /* Count statements in scalar loop. Using this as scalar cost for a single 2223 /* Count statements in scalar loop. Using this as scalar cost for a single
2063 iteration for now. 2224 iteration for now.
2064 2225
2065 TODO: Add outer loop support. 2226 TODO: Add outer loop support.
2088 /* Skip stmts that are not vectorized inside the loop. */ 2249 /* Skip stmts that are not vectorized inside the loop. */
2089 if (!STMT_VINFO_RELEVANT_P (stmt_info) 2250 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2090 && (!STMT_VINFO_LIVE_P (stmt_info) 2251 && (!STMT_VINFO_LIVE_P (stmt_info)
2091 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) 2252 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2092 continue; 2253 continue;
2093 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
2094 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; 2254 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2095 /* FIXME: for stmts in the inner-loop in outer-loop vectorization, 2255 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2096 some of the "outside" costs are generated inside the outer-loop. */ 2256 some of the "outside" costs are generated inside the outer-loop. */
2097 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); 2257 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2098 } 2258 }
2099 } 2259 }
2100 2260
2261 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2262
2101 /* Add additional cost for the peeled instructions in prologue and epilogue 2263 /* Add additional cost for the peeled instructions in prologue and epilogue
2102 loop. 2264 loop.
2103 2265
2104 FORNOW: If we don't know the value of peel_iters for prologue or epilogue 2266 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2105 at compile-time - we assume it's vf/2 (the worst would be vf-1). 2267 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2106 2268
2107 TODO: Build an expression that represents peel_iters for prologue and 2269 TODO: Build an expression that represents peel_iters for prologue and
2108 epilogue to be used in a run-time test. */ 2270 epilogue to be used in a run-time test. */
2109 2271
2110 if (byte_misalign < 0) 2272 if (npeel < 0)
2111 { 2273 {
2112 peel_iters_prologue = vf/2; 2274 peel_iters_prologue = vf/2;
2113 if (vect_print_dump_info (REPORT_COST)) 2275 if (vect_print_dump_info (REPORT_COST))
2114 fprintf (vect_dump, "cost model: " 2276 fprintf (vect_dump, "cost model: "
2115 "prologue peel iters set to vf/2."); 2277 "prologue peel iters set to vf/2.");
2124 2286
2125 /* If peeled iterations are unknown, count a taken branch and a not taken 2287 /* If peeled iterations are unknown, count a taken branch and a not taken
2126 branch per peeled loop. Even if scalar loop iterations are known, 2288 branch per peeled loop. Even if scalar loop iterations are known,
2127 vector iterations are not known since peeled prologue iterations are 2289 vector iterations are not known since peeled prologue iterations are
2128 not known. Hence guards remain the same. */ 2290 not known. Hence guards remain the same. */
2129 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST 2291 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2130 + TARG_COND_NOT_TAKEN_BRANCH_COST); 2292 + vect_get_cost (cond_branch_not_taken));
2293 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2294 + (peel_iters_epilogue * scalar_single_iter_cost)
2295 + peel_guard_costs;
2131 } 2296 }
2132 else 2297 else
2133 { 2298 {
2134 if (byte_misalign) 2299 peel_iters_prologue = npeel;
2135 { 2300 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2136 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 2301 peel_iters_prologue, &peel_iters_epilogue,
2137 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); 2302 scalar_single_iter_cost);
2138 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); 2303 }
2139 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2140
2141 peel_iters_prologue = nelements - (byte_misalign / element_size);
2142 }
2143 else
2144 peel_iters_prologue = 0;
2145
2146 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2147 {
2148 peel_iters_epilogue = vf/2;
2149 if (vect_print_dump_info (REPORT_COST))
2150 fprintf (vect_dump, "cost model: "
2151 "epilogue peel iters set to vf/2 because "
2152 "loop iterations are unknown .");
2153
2154 /* If peeled iterations are known but number of scalar loop
2155 iterations are unknown, count a taken branch per peeled loop. */
2156 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
2157
2158 }
2159 else
2160 {
2161 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2162 peel_iters_prologue = niters < peel_iters_prologue ?
2163 niters : peel_iters_prologue;
2164 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2165 }
2166 }
2167
2168 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2169 + (peel_iters_epilogue * scalar_single_iter_cost)
2170 + peel_guard_costs;
2171 2304
2172 /* FORNOW: The scalar outside cost is incremented in one of the 2305 /* FORNOW: The scalar outside cost is incremented in one of the
2173 following ways: 2306 following ways:
2174 2307
2175 1. The vectorizer checks for alignment and aliasing and generates 2308 1. The vectorizer checks for alignment and aliasing and generates
2218 TODO: The back end may reorder the BBS's differently and reverse 2351 TODO: The back end may reorder the BBS's differently and reverse
2219 conditions/branch directions. Change the estimates below to 2352 conditions/branch directions. Change the estimates below to
2220 something more reasonable. */ 2353 something more reasonable. */
2221 2354
2222 /* If the number of iterations is known and we do not do versioning, we can 2355 /* If the number of iterations is known and we do not do versioning, we can
2223 decide whether to vectorize at compile time. Hence the scalar version 2356 decide whether to vectorize at compile time. Hence the scalar version
2224 do not carry cost model guard costs. */ 2357 do not carry cost model guard costs. */
2225 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2358 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2226 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) 2359 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2227 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 2360 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2228 { 2361 {
2229 /* Cost model check occurs at versioning. */ 2362 /* Cost model check occurs at versioning. */
2230 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) 2363 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2231 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 2364 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2232 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST; 2365 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2233 else 2366 else
2234 { 2367 {
2235 /* Cost model check occurs at prologue generation. */ 2368 /* Cost model check occurs at prologue generation. */
2236 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 2369 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2237 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST 2370 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2238 + TARG_COND_NOT_TAKEN_BRANCH_COST; 2371 + vect_get_cost (cond_branch_not_taken);
2239 /* Cost model check occurs at epilogue generation. */ 2372 /* Cost model check occurs at epilogue generation. */
2240 else 2373 else
2241 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST; 2374 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2242 } 2375 }
2243 } 2376 }
2244 2377
2245 /* Add SLP costs. */ 2378 /* Add SLP costs. */
2246 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); 2379 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2247 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) 2380 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2248 { 2381 {
2249 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance); 2382 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2250 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance); 2383 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2251 } 2384 }
2252 2385
2253 /* Calculate number of iterations required to make the vector version 2386 /* Calculate number of iterations required to make the vector version
2254 profitable, relative to the loop bodies only. The following condition 2387 profitable, relative to the loop bodies only. The following condition
2255 must hold true: 2388 must hold true:
2256 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC 2389 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2257 where 2390 where
2258 SIC = scalar iteration cost, VIC = vector iteration cost, 2391 SIC = scalar iteration cost, VIC = vector iteration cost,
2259 VOC = vector outside cost, VF = vectorization factor, 2392 VOC = vector outside cost, VF = vectorization factor,
2346 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2479 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2347 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2480 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2348 2481
2349 2482
2350 /* Cost of reduction op inside loop. */ 2483 /* Cost of reduction op inside loop. */
2351 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST; 2484 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2485 += ncopies * vect_get_cost (vector_stmt);
2352 2486
2353 stmt = STMT_VINFO_STMT (stmt_info); 2487 stmt = STMT_VINFO_STMT (stmt_info);
2354 2488
2355 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 2489 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2356 { 2490 {
2386 orig_stmt = STMT_VINFO_STMT (stmt_info); 2520 orig_stmt = STMT_VINFO_STMT (stmt_info);
2387 2521
2388 code = gimple_assign_rhs_code (orig_stmt); 2522 code = gimple_assign_rhs_code (orig_stmt);
2389 2523
2390 /* Add in cost for initial definition. */ 2524 /* Add in cost for initial definition. */
2391 outer_cost += TARG_SCALAR_TO_VEC_COST; 2525 outer_cost += vect_get_cost (scalar_to_vec);
2392 2526
2393 /* Determine cost of epilogue code. 2527 /* Determine cost of epilogue code.
2394 2528
2395 We have a reduction operator that will reduce the vector in one statement. 2529 We have a reduction operator that will reduce the vector in one statement.
2396 Also requires scalar extract. */ 2530 Also requires scalar extract. */
2397 2531
2398 if (!nested_in_vect_loop_p (loop, orig_stmt)) 2532 if (!nested_in_vect_loop_p (loop, orig_stmt))
2399 { 2533 {
2400 if (reduc_code != ERROR_MARK) 2534 if (reduc_code != ERROR_MARK)
2401 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST; 2535 outer_cost += vect_get_cost (vector_stmt)
2536 + vect_get_cost (vec_to_scalar);
2402 else 2537 else
2403 { 2538 {
2404 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); 2539 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2405 tree bitsize = 2540 tree bitsize =
2406 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt))); 2541 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2409 2544
2410 optab = optab_for_tree_code (code, vectype, optab_default); 2545 optab = optab_for_tree_code (code, vectype, optab_default);
2411 2546
2412 /* We have a whole vector shift available. */ 2547 /* We have a whole vector shift available. */
2413 if (VECTOR_MODE_P (mode) 2548 if (VECTOR_MODE_P (mode)
2414 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing 2549 && optab_handler (optab, mode) != CODE_FOR_nothing
2415 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) 2550 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2416 /* Final reduction via vector shifts and the reduction operator. Also 2551 /* Final reduction via vector shifts and the reduction operator. Also
2417 requires scalar extract. */ 2552 requires scalar extract. */
2418 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST 2553 outer_cost += ((exact_log2(nelements) * 2)
2419 + TARG_VEC_TO_SCALAR_COST); 2554 * vect_get_cost (vector_stmt)
2555 + vect_get_cost (vec_to_scalar));
2420 else 2556 else
2421 /* Use extracts and reduction op for final reduction. For N elements, 2557 /* Use extracts and reduction op for final reduction. For N elements,
2422 we have N extracts and N-1 reduction ops. */ 2558 we have N extracts and N-1 reduction ops. */
2423 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST); 2559 outer_cost += ((nelements + nelements - 1)
2560 * vect_get_cost (vector_stmt));
2424 } 2561 }
2425 } 2562 }
2426 2563
2427 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost; 2564 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2428 2565
2441 2578
2442 static void 2579 static void
2443 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies) 2580 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2444 { 2581 {
2445 /* loop cost for vec_loop. */ 2582 /* loop cost for vec_loop. */
2446 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST; 2583 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2584 = ncopies * vect_get_cost (vector_stmt);
2447 /* prologue cost for vec_init and vec_step. */ 2585 /* prologue cost for vec_init and vec_step. */
2448 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST; 2586 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2587 = 2 * vect_get_cost (scalar_to_vec);
2449 2588
2450 if (vect_print_dump_info (REPORT_COST)) 2589 if (vect_print_dump_info (REPORT_COST))
2451 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, " 2590 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2452 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info), 2591 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2453 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); 2592 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2460 STMT - a stmt that performs an induction operation in the loop. 2599 STMT - a stmt that performs an induction operation in the loop.
2461 IV_PHI - the initial value of the induction variable 2600 IV_PHI - the initial value of the induction variable
2462 2601
2463 Output: 2602 Output:
2464 Return a vector variable, initialized with the first VF values of 2603 Return a vector variable, initialized with the first VF values of
2465 the induction variable. E.g., for an iv with IV_PHI='X' and 2604 the induction variable. E.g., for an iv with IV_PHI='X' and
2466 evolution S, for a vector of 4 units, we want to return: 2605 evolution S, for a vector of 4 units, we want to return:
2467 [X, X + S, X + 2*S, X + 3*S]. */ 2606 [X, X + S, X + 2*S, X + 3*S]. */
2468 2607
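For a hypothetical iv with X = 10, S = 3 and a 4-unit vector, the returned initial def is {10, 13, 16, 19}; the loop phi then advances it by a step vector built from VF * S, here {12, 12, 12, 12}.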
2469 static tree 2608 static tree
2470 get_initial_def_for_induction (gimple iv_phi) 2609 get_initial_def_for_induction (gimple iv_phi)
2471 { 2610 {
2472 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi); 2611 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); 2612 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2474 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2613 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2475 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi)); 2614 tree scalar_type;
2476 tree vectype; 2615 tree vectype;
2477 int nunits; 2616 int nunits;
2478 edge pe = loop_preheader_edge (loop); 2617 edge pe = loop_preheader_edge (loop);
2479 struct loop *iv_loop; 2618 struct loop *iv_loop;
2480 basic_block new_bb; 2619 basic_block new_bb;
2499 edge latch_e; 2638 edge latch_e;
2500 tree loop_arg; 2639 tree loop_arg;
2501 gimple_stmt_iterator si; 2640 gimple_stmt_iterator si;
2502 basic_block bb = gimple_bb (iv_phi); 2641 basic_block bb = gimple_bb (iv_phi);
2503 tree stepvectype; 2642 tree stepvectype;
2504 2643 tree resvectype;
2505 vectype = get_vectype_for_scalar_type (scalar_type);
2506 gcc_assert (vectype);
2507 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2508 ncopies = vf / nunits;
2509
2510 gcc_assert (phi_info);
2511 gcc_assert (ncopies >= 1);
2512
2513 /* Find the first insertion point in the BB. */
2514 si = gsi_after_labels (bb);
2515
2516 if (INTEGRAL_TYPE_P (scalar_type))
2517 step_expr = build_int_cst (scalar_type, 0);
2518 else if (POINTER_TYPE_P (scalar_type))
2519 step_expr = build_int_cst (sizetype, 0);
2520 else
2521 step_expr = build_real (scalar_type, dconst0);
2522 2644
2523 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */ 2645 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2524 if (nested_in_vect_loop_p (loop, iv_phi)) 2646 if (nested_in_vect_loop_p (loop, iv_phi))
2525 { 2647 {
2526 nested_in_vect_loop = true; 2648 nested_in_vect_loop = true;
2533 latch_e = loop_latch_edge (iv_loop); 2655 latch_e = loop_latch_edge (iv_loop);
2534 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e); 2656 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2535 2657
2536 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi)); 2658 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2537 gcc_assert (access_fn); 2659 gcc_assert (access_fn);
2660 STRIP_NOPS (access_fn);
2538 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn, 2661 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2539 &init_expr, &step_expr); 2662 &init_expr, &step_expr);
2540 gcc_assert (ok); 2663 gcc_assert (ok);
2541 pe = loop_preheader_edge (iv_loop); 2664 pe = loop_preheader_edge (iv_loop);
2542 2665
2666 scalar_type = TREE_TYPE (init_expr);
2667 vectype = get_vectype_for_scalar_type (scalar_type);
2668 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2669 gcc_assert (vectype);
2670 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2671 ncopies = vf / nunits;
2672
2673 gcc_assert (phi_info);
2674 gcc_assert (ncopies >= 1);
2675
2676 /* Find the first insertion point in the BB. */
2677 si = gsi_after_labels (bb);
2678
2543 /* Create the vector that holds the initial_value of the induction. */ 2679 /* Create the vector that holds the initial_value of the induction. */
2544 if (nested_in_vect_loop) 2680 if (nested_in_vect_loop)
2545 { 2681 {
2546 /* iv_loop is nested in the loop to be vectorized. init_expr had already 2682 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2547 been created during vectorization of previous stmts; We obtain it from 2683 been created during vectorization of previous stmts. We obtain it
2548 the STMT_VINFO_VEC_STMT of the defining stmt. */ 2684 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2549 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, 2685 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2550 loop_preheader_edge (iv_loop)); 2686 loop_preheader_edge (iv_loop));
2551 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL); 2687 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2552 } 2688 }
2553 else 2689 else
2563 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 2699 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2564 gcc_assert (!new_bb); 2700 gcc_assert (!new_bb);
2565 } 2701 }
2566 2702
2567 t = NULL_TREE; 2703 t = NULL_TREE;
2568 t = tree_cons (NULL_TREE, init_expr, t); 2704 t = tree_cons (NULL_TREE, new_name, t);
2569 for (i = 1; i < nunits; i++) 2705 for (i = 1; i < nunits; i++)
2570 { 2706 {
2571 /* Create: new_name_i = new_name + step_expr */ 2707 /* Create: new_name_i = new_name + step_expr */
2572 enum tree_code code = POINTER_TYPE_P (scalar_type) 2708 enum tree_code code = POINTER_TYPE_P (scalar_type)
2573 ? POINTER_PLUS_EXPR : PLUS_EXPR; 2709 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2604 expr = build_int_cst (TREE_TYPE (step_expr), vf); 2740 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2605 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 2741 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2606 expr, step_expr); 2742 expr, step_expr);
2607 } 2743 }
2608 2744
2609 t = NULL_TREE; 2745 t = unshare_expr (new_name);
2610 for (i = 0; i < nunits; i++)
2611 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2612 gcc_assert (CONSTANT_CLASS_P (new_name)); 2746 gcc_assert (CONSTANT_CLASS_P (new_name));
2613 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name)); 2747 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2614 gcc_assert (stepvectype); 2748 gcc_assert (stepvectype);
2615 vec = build_vector (stepvectype, t); 2749 vec = build_vector_from_val (stepvectype, t);
2616 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); 2750 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2617 2751
2618 2752
2619 /* Create the following def-use cycle: 2753 /* Create the following def-use cycle:
2620 loop prolog: 2754 loop prolog:
2664 2798
2665 /* Create the vector that holds the step of the induction. */ 2799 /* Create the vector that holds the step of the induction. */
2666 expr = build_int_cst (TREE_TYPE (step_expr), nunits); 2800 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2667 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 2801 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2668 expr, step_expr); 2802 expr, step_expr);
2669 t = NULL_TREE; 2803 t = unshare_expr (new_name);
2670 for (i = 0; i < nunits; i++)
2671 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2672 gcc_assert (CONSTANT_CLASS_P (new_name)); 2804 gcc_assert (CONSTANT_CLASS_P (new_name));
2673 vec = build_vector (stepvectype, t); 2805 vec = build_vector_from_val (stepvectype, t);
2674 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); 2806 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2675 2807
2676 vec_def = induc_def; 2808 vec_def = induc_def;
2677 prev_stmt_vinfo = vinfo_for_stmt (induction_phi); 2809 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2678 for (i = 1; i < ncopies; i++) 2810 for (i = 1; i < ncopies; i++)
2682 vec_def, vec_step); 2814 vec_def, vec_step);
2683 vec_def = make_ssa_name (vec_dest, new_stmt); 2815 vec_def = make_ssa_name (vec_dest, new_stmt);
2684 gimple_assign_set_lhs (new_stmt, vec_def); 2816 gimple_assign_set_lhs (new_stmt, vec_def);
2685 2817
2686 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 2818 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2819 if (!useless_type_conversion_p (resvectype, vectype))
2820 {
2821 new_stmt = gimple_build_assign_with_ops
2822 (VIEW_CONVERT_EXPR,
2823 vect_get_new_vect_var (resvectype, vect_simple_var,
2824 "vec_iv_"),
2825 build1 (VIEW_CONVERT_EXPR, resvectype,
2826 gimple_assign_lhs (new_stmt)), NULL_TREE);
2827 gimple_assign_set_lhs (new_stmt,
2828 make_ssa_name
2829 (gimple_assign_lhs (new_stmt), new_stmt));
2830 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2831 }
2687 set_vinfo_for_stmt (new_stmt, 2832 set_vinfo_for_stmt (new_stmt,
2688 new_stmt_vec_info (new_stmt, loop_vinfo, NULL)); 2833 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2689 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; 2834 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2690 prev_stmt_vinfo = vinfo_for_stmt (new_stmt); 2835 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2691 } 2836 }
2729 fprintf (vect_dump, "\n"); 2874 fprintf (vect_dump, "\n");
2730 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM); 2875 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2731 } 2876 }
2732 2877
2733 STMT_VINFO_VEC_STMT (phi_info) = induction_phi; 2878 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2879 if (!useless_type_conversion_p (resvectype, vectype))
2880 {
2881 new_stmt = gimple_build_assign_with_ops
2882 (VIEW_CONVERT_EXPR,
2883 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
2884 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
2885 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
2886 gimple_assign_set_lhs (new_stmt, induc_def);
2887 si = gsi_start_bb (bb);
2888 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2889 set_vinfo_for_stmt (new_stmt,
2890 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2891 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
2892 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
2893 }
2894
2734 return induc_def; 2895 return induc_def;
2735 } 2896 }
2736 2897
2737 2898
2738 /* Function get_initial_def_for_reduction 2899 /* Function get_initial_def_for_reduction
2809 nested_in_vect_loop = true; 2970 nested_in_vect_loop = true;
2810 else 2971 else
2811 gcc_assert (loop == (gimple_bb (stmt))->loop_father); 2972 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2812 2973
2813 /* In case of double reduction we only create a vector variable to be put 2974 /* In case of double reduction we only create a vector variable to be put
2814 in the reduction phi node. The actual statement creation is done in 2975 in the reduction phi node. The actual statement creation is done in
2815 vect_create_epilog_for_reduction. */ 2976 vect_create_epilog_for_reduction. */
2816 if (adjustment_def && nested_in_vect_loop 2977 if (adjustment_def && nested_in_vect_loop
2817 && TREE_CODE (init_val) == SSA_NAME 2978 && TREE_CODE (init_val) == SSA_NAME
2818 && (def_stmt = SSA_NAME_DEF_STMT (init_val)) 2979 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2819 && gimple_code (def_stmt) == GIMPLE_PHI 2980 && gimple_code (def_stmt) == GIMPLE_PHI
2855 NULL); 3016 NULL);
2856 else 3017 else
2857 *adjustment_def = init_val; 3018 *adjustment_def = init_val;
2858 } 3019 }
2859 3020
2860 if (code == MULT_EXPR || code == BIT_AND_EXPR) 3021 if (code == MULT_EXPR)
2861 { 3022 {
2862 real_init_val = dconst1; 3023 real_init_val = dconst1;
2863 int_init_val = 1; 3024 int_init_val = 1;
2864 } 3025 }
3026
3027 if (code == BIT_AND_EXPR)
3028 int_init_val = -1;
2865 3029
2866 if (SCALAR_FLOAT_TYPE_P (scalar_type)) 3030 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2867 def_for_init = build_real (scalar_type, real_init_val); 3031 def_for_init = build_real (scalar_type, real_init_val);
2868 else 3032 else
2869 def_for_init = build_int_cst (scalar_type, int_init_val); 3033 def_for_init = build_int_cst (scalar_type, int_init_val);
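[Editor's note: the hunk above also corrects the neutral element for BIT_AND; the old code reused 1, the new code uses -1 (all ones), since x & ~0 == x. A hedged sketch of the mapping, with the tree codes replaced by a plain enum; the real code builds trees via build_int_cst / build_real rather than returning ints.]

#include <stdio.h>

enum red_code { RED_PLUS, RED_MULT, RED_BIT_AND, RED_BIT_IOR };

static int
neutral_init_val (enum red_code code)
{
  switch (code)
    {
    case RED_MULT:    return 1;    /* x * 1 == x */
    case RED_BIT_AND: return -1;   /* x & ~0 == x (the fix above) */
    default:          return 0;    /* x + 0 == x, x | 0 == x */
    }
}

int
main (void)
{
  printf ("%d\n", neutral_init_val (RED_BIT_AND));  /* -1 */
  return 0;
}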
2897 *adjustment_def = NULL_TREE; 3061 *adjustment_def = NULL_TREE;
2898 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL); 3062 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2899 break; 3063 break;
2900 } 3064 }
2901 3065
2902 for (i = nunits - 1; i >= 0; --i) 3066 init_def = build_vector_from_val (vectype, init_value);
2903 t = tree_cons (NULL_TREE, init_value, t);
2904
2905 if (TREE_CONSTANT (init_val))
2906 init_def = build_vector (vectype, t);
2907 else
2908 init_def = build_constructor_from_list (vectype, t);
2909
2910 break; 3067 break;
2911 3068
2912 default: 3069 default:
2913 gcc_unreachable (); 3070 gcc_unreachable ();
2914 } 3071 }
2924 3081
2925 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of vector 3082 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of vector
2926 reduction statements. 3083 reduction statements.
2927 STMT is the scalar reduction stmt that is being vectorized. 3084 STMT is the scalar reduction stmt that is being vectorized.
2928 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the 3085 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2929 number of elements that we can fit in a vectype (nunits). In this case 3086 number of elements that we can fit in a vectype (nunits). In this case
2930 we have to generate more than one vector stmt, i.e., we need to "unroll" 3087 we have to generate more than one vector stmt, i.e., we need to "unroll"
2931 the vector stmt by a factor VF/nunits. For more details see documentation 3088 the vector stmt by a factor VF/nunits. For more details see documentation
2932 in vectorizable_operation. 3089 in vectorizable_operation.
2933 REDUC_CODE is the tree-code for the epilog reduction. 3090 REDUC_CODE is the tree-code for the epilog reduction.
2934 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction 3091 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3004 tree bitsize, bitpos; 3161 tree bitsize, bitpos;
3005 tree adjustment_def = NULL; 3162 tree adjustment_def = NULL;
3006 tree vec_initial_def = NULL; 3163 tree vec_initial_def = NULL;
3007 tree reduction_op, expr, def; 3164 tree reduction_op, expr, def;
3008 tree orig_name, scalar_result; 3165 tree orig_name, scalar_result;
3009 imm_use_iterator imm_iter; 3166 imm_use_iterator imm_iter, phi_imm_iter;
3010 use_operand_p use_p; 3167 use_operand_p use_p, phi_use_p;
3011 bool extract_scalar_result = false; 3168 bool extract_scalar_result = false;
3012 gimple use_stmt, orig_stmt, reduction_phi = NULL; 3169 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3013 bool nested_in_vect_loop = false; 3170 bool nested_in_vect_loop = false;
3014 VEC (gimple, heap) *new_phis = NULL; 3171 VEC (gimple, heap) *new_phis = NULL;
3015 enum vect_def_type dt = vect_unknown_def_type; 3172 enum vect_def_type dt = vect_unknown_def_type;
3069 3226
3070 (in case of SLP, do it for all the phis). */ 3227 (in case of SLP, do it for all the phis). */
3071 3228
3072 /* Get the loop-entry arguments. */ 3229 /* Get the loop-entry arguments. */
3073 if (slp_node) 3230 if (slp_node)
3074 vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index); 3231 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3232 NULL, reduc_index);
3075 else 3233 else
3076 { 3234 {
3077 vec_initial_defs = VEC_alloc (tree, heap, 1); 3235 vec_initial_defs = VEC_alloc (tree, heap, 1);
3078 /* For the case of reduction, vect_get_vec_def_for_operand returns 3236 /* For the case of reduction, vect_get_vec_def_for_operand returns
3079 the scalar def before the loop, that defines the initial value 3237 the scalar def before the loop, that defines the initial value
3082 &adjustment_def); 3240 &adjustment_def);
3083 VEC_quick_push (tree, vec_initial_defs, vec_initial_def); 3241 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3084 } 3242 }
3085 3243
3086 /* Set phi nodes arguments. */ 3244 /* Set phi nodes arguments. */
3087 for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++) 3245 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3088 { 3246 {
3089 tree vec_init_def = VEC_index (tree, vec_initial_defs, i); 3247 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3090 tree def = VEC_index (tree, vect_defs, i); 3248 tree def = VEC_index (tree, vect_defs, i);
3091 for (j = 0; j < ncopies; j++) 3249 for (j = 0; j < ncopies; j++)
3092 { 3250 {
3148 Store them in NEW_PHIS. */ 3306 Store them in NEW_PHIS. */
3149 3307
3150 exit_bb = single_exit (loop)->dest; 3308 exit_bb = single_exit (loop)->dest;
3151 prev_phi_info = NULL; 3309 prev_phi_info = NULL;
3152 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs)); 3310 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3153 for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++) 3311 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3154 { 3312 {
3155 for (j = 0; j < ncopies; j++) 3313 for (j = 0; j < ncopies; j++)
3156 { 3314 {
3157 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb); 3315 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3158 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL)); 3316 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3165 } 3323 }
3166 3324
3167 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); 3325 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3168 prev_phi_info = vinfo_for_stmt (phi); 3326 prev_phi_info = vinfo_for_stmt (phi);
3169 } 3327 }
3328 }
3329
3330 /* The epilogue is created for the outer-loop, i.e., for the loop being
3331 vectorized. */
3332 if (double_reduc)
3333 {
3334 loop = outer_loop;
3335 exit_bb = single_exit (loop)->dest;
3170 } 3336 }
3171 3337
3172 exit_gsi = gsi_after_labels (exit_bb); 3338 exit_gsi = gsi_after_labels (exit_bb);
3173 3339
3174 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 3340 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3207 bitsize = TYPE_SIZE (scalar_type); 3373 bitsize = TYPE_SIZE (scalar_type);
3208 3374
3209 /* In case this is a reduction in an inner-loop while vectorizing an outer 3375 /* In case this is a reduction in an inner-loop while vectorizing an outer
3210 loop, we don't need to extract a single scalar result at the end of the 3376 loop, we don't need to extract a single scalar result at the end of the
3211 inner-loop (unless it is a double reduction, i.e., the use of the reduction is 3377 inner-loop (unless it is a double reduction, i.e., the use of the reduction is
3212 outside the outer-loop). The final vector of partial results will be used 3378 outside the outer-loop). The final vector of partial results will be used
3213 in the vectorized outer-loop, or reduced to a scalar result at the end of 3379 in the vectorized outer-loop, or reduced to a scalar result at the end of
3214 the outer-loop. */ 3380 the outer-loop. */
3215 if (nested_in_vect_loop && !double_reduc) 3381 if (nested_in_vect_loop && !double_reduc)
3216 goto vect_finalize_reduction; 3382 goto vect_finalize_reduction;
3217 3383
3245 int bit_offset; 3411 int bit_offset;
3246 int element_bitsize = tree_low_cst (bitsize, 1); 3412 int element_bitsize = tree_low_cst (bitsize, 1);
3247 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); 3413 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3248 tree vec_temp; 3414 tree vec_temp;
3249 3415
3250 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) 3416 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3251 shift_code = VEC_RSHIFT_EXPR; 3417 shift_code = VEC_RSHIFT_EXPR;
3252 else 3418 else
3253 have_whole_vector_shift = false; 3419 have_whole_vector_shift = false;
3254 3420
3255 /* Regardless of whether we have a whole vector shift, if we're 3421 /* Regardless of whether we have a whole vector shift, if we're
3261 if (!VECTOR_MODE_P (mode)) 3427 if (!VECTOR_MODE_P (mode))
3262 have_whole_vector_shift = false; 3428 have_whole_vector_shift = false;
3263 else 3429 else
3264 { 3430 {
3265 optab optab = optab_for_tree_code (code, vectype, optab_default); 3431 optab optab = optab_for_tree_code (code, vectype, optab_default);
3266 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) 3432 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3267 have_whole_vector_shift = false; 3433 have_whole_vector_shift = false;
3268 } 3434 }
3269 3435
3270 if (have_whole_vector_shift && !slp_node) 3436 if (have_whole_vector_shift && !slp_node)
3271 { 3437 {
3319 3485
3320 if (vect_print_dump_info (REPORT_DETAILS)) 3486 if (vect_print_dump_info (REPORT_DETAILS))
3321 fprintf (vect_dump, "Reduce using scalar code. "); 3487 fprintf (vect_dump, "Reduce using scalar code. ");
3322 3488
3323 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); 3489 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3324 for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++) 3490 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3325 { 3491 {
3326 vec_temp = PHI_RESULT (new_phi); 3492 vec_temp = PHI_RESULT (new_phi);
3327 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, 3493 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3328 bitsize_zero_node); 3494 bitsize_zero_node);
3329 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 3495 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3366 } 3532 }
3367 } 3533 }
3368 } 3534 }
3369 3535
3370 /* The only case where we need to reduce scalar results in SLP, is 3536 /* The only case where we need to reduce scalar results in SLP, is
3371 unrolling. If the size of SCALAR_RESULTS is greater than 3537 unrolling. If the size of SCALAR_RESULTS is greater than
3372 GROUP_SIZE, we reduce them combining elements modulo 3538 GROUP_SIZE, we reduce them combining elements modulo
3373 GROUP_SIZE. */ 3539 GROUP_SIZE. */
3374 if (slp_node) 3540 if (slp_node)
3375 { 3541 {
3376 tree res, first_res, new_res; 3542 tree res, first_res, new_res;
3422 VEC_safe_push (tree, heap, scalar_results, new_temp); 3588 VEC_safe_push (tree, heap, scalar_results, new_temp);
3423 } 3589 }
3424 3590
3425 vect_finalize_reduction: 3591 vect_finalize_reduction:
3426 3592
3593 if (double_reduc)
3594 loop = loop->inner;
3595
3427 /* 2.5 Adjust the final result by the initial value of the reduction 3596 /* 2.5 Adjust the final result by the initial value of the reduction
3428 variable. (When such adjustment is not needed, then 3597 variable. (When such adjustment is not needed, then
3429 'adjustment_def' is zero). For example, if code is PLUS we create: 3598 'adjustment_def' is zero). For example, if code is PLUS we create:
3430 new_temp = loop_exit_def + adjustment_def */ 3599 new_temp = loop_exit_def + adjustment_def */
3431 3600
3469 VEC_replace (tree, scalar_results, 0, new_temp); 3638 VEC_replace (tree, scalar_results, 0, new_temp);
3470 3639
3471 VEC_replace (gimple, new_phis, 0, epilog_stmt); 3640 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3472 } 3641 }
3473 3642
3474 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit 3643 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3475 phis with new adjusted scalar results, i.e., replace use <s_out0> 3644 phis with new adjusted scalar results, i.e., replace use <s_out0>
3476 with use <s_out4>. 3645 with use <s_out4>.
3477 3646
3478 Transform: 3647 Transform:
3479 loop_exit: 3648 loop_exit:
3495 s_out4 = adjust_result <s_out3> 3664 s_out4 = adjust_result <s_out3>
3496 use <s_out4> 3665 use <s_out4>
3497 use <s_out4> */ 3666 use <s_out4> */
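[Editor's note: a hedged end-to-end illustration of steps 2.3 through 2.6 for a PLUS reduction with VF == 4, in plain C with made-up data; the real pass emits gimple, not loops.]

#include <stdio.h>

int
main (void)
{
  int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int s_init = 100;               /* initial value of the reduction */
  int vec_dest[4] = {0, 0, 0, 0}; /* vector accumulator, neutral init */
  int i, l;

  for (i = 0; i < 8; i += 4)      /* the vectorized loop */
    for (l = 0; l < 4; l++)
      vec_dest[l] += a[i + l];

  int s_out3 = 0;                 /* 2.3/2.4: reduce the partial results */
  for (l = 0; l < 4; l++)
    s_out3 += vec_dest[l];

  int s_out4 = s_out3 + s_init;   /* 2.5: adjust by adjustment_def */

  printf ("%d\n", s_out4);        /* 136; 2.6: uses now read s_out4 */
  return 0;
}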
3498 3667
3499 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in 3668 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3500 case that GROUP_SIZE is greater than the vectorization factor). Therefore, we 3669 case that GROUP_SIZE is greater than the vectorization factor). Therefore, we
3501 need to match SCALAR_RESULTS with corresponding statements. The first 3670 need to match SCALAR_RESULTS with corresponding statements. The first
3502 (GROUP_SIZE / number of new vector stmts) scalar results correspond to 3671 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3503 the first vector stmt, etc. 3672 the first vector stmt, etc.
3504 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */ 3673 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
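[Editor's note: a small worked instance of the RATIO bookkeeping, with assumed sizes just to pin the arithmetic down.]

#include <stdio.h>

int
main (void)
{
  int group_size = 4;   /* assumed SLP group size */
  int n_vec_stmts = 2;  /* assumed VEC_length (gimple, new_phis) */
  int ratio = group_size / n_vec_stmts;   /* == 2 */

  /* Scalar results 0..1 pair with the first vector stmt,
     results 2..3 with the second.  */
  for (int k = 0; k < group_size; k++)
    printf ("scalar result %d -> vector stmt %d\n", k, k / ratio);
  return 0;
}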
3505 if (group_size > VEC_length (gimple, new_phis)) 3674 if (group_size > VEC_length (gimple, new_phis))
3506 { 3675 {
3529 scalar_dest = gimple_assign_lhs (current_stmt); 3698 scalar_dest = gimple_assign_lhs (current_stmt);
3530 } 3699 }
3531 3700
3532 phis = VEC_alloc (gimple, heap, 3); 3701 phis = VEC_alloc (gimple, heap, 3);
3533 /* Find the loop-closed-use at the loop exit of the original scalar 3702 /* Find the loop-closed-use at the loop exit of the original scalar
3534 result. (The reduction result is expected to have two immediate uses - 3703 result. (The reduction result is expected to have two immediate uses -
3535 one at the latch block, and one at the loop exit). */ 3704 one at the latch block, and one at the loop exit). */
3536 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 3705 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3537 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) 3706 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3538 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p)); 3707 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3539 3708
3540 /* We expect to have found an exit_phi because of loop-closed-ssa 3709 /* We expect to have found an exit_phi because of loop-closed-ssa
3541 form. */ 3710 form. */
3542 gcc_assert (!VEC_empty (gimple, phis)); 3711 gcc_assert (!VEC_empty (gimple, phis));
3543 3712
3544 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++) 3713 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3545 { 3714 {
3546 if (outer_loop) 3715 if (outer_loop)
3547 { 3716 {
3548 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); 3717 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3549 gimple vect_phi; 3718 gimple vect_phi;
3630 } 3799 }
3631 3800
3632 vect_phi_res = PHI_RESULT (vect_phi); 3801 vect_phi_res = PHI_RESULT (vect_phi);
3633 3802
3634 /* Replace the use, i.e., set the correct vs1 in the regular 3803 /* Replace the use, i.e., set the correct vs1 in the regular
3635 reduction phi node. FORNOW, NCOPIES is always 1, so the 3804 reduction phi node. FORNOW, NCOPIES is always 1, so the
3636 loop is redundant. */ 3805 loop is redundant. */
3637 use = reduction_phi; 3806 use = reduction_phi;
3638 for (j = 0; j < ncopies; j++) 3807 for (j = 0; j < ncopies; j++)
3639 { 3808 {
3640 edge pr_edge = loop_preheader_edge (loop); 3809 edge pr_edge = loop_preheader_edge (loop);
3641 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); 3810 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3642 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); 3811 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3643 } 3812 }
3644 } 3813 }
3645 } 3814 }
3646 3815 }
3816
3817 VEC_free (gimple, heap, phis);
3818 if (nested_in_vect_loop)
3819 {
3820 if (double_reduc)
3821 loop = outer_loop;
3822 else
3823 continue;
3824 }
3825
3826 phis = VEC_alloc (gimple, heap, 3);
3827 /* Find the loop-closed-use at the loop exit of the original scalar
3828 result. (The reduction result is expected to have two immediate uses,
3829 one at the latch block, and one at the loop exit). For double
3830 reductions we are looking for exit phis of the outer loop. */
3831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3832 {
3833 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3834 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3835 else
3836 {
3837 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
3838 {
3839 tree phi_res = PHI_RESULT (USE_STMT (use_p));
3840
3841 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
3842 {
3843 if (!flow_bb_inside_loop_p (loop,
3844 gimple_bb (USE_STMT (phi_use_p))))
3845 VEC_safe_push (gimple, heap, phis,
3846 USE_STMT (phi_use_p));
3847 }
3848 }
3849 }
3850 }
3851
3852 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3853 {
3647 /* Replace the uses: */ 3854 /* Replace the uses: */
3648 orig_name = PHI_RESULT (exit_phi); 3855 orig_name = PHI_RESULT (exit_phi);
3649 scalar_result = VEC_index (tree, scalar_results, k); 3856 scalar_result = VEC_index (tree, scalar_results, k);
3650 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 3857 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3651 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 3858 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3666 If VEC_STMT is also passed, vectorize the STMT: create a vectorized 3873 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3667 stmt to replace it, put it in VEC_STMT, and insert it at GSI. 3874 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3668 Return FALSE if not a vectorizable STMT, TRUE otherwise. 3875 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3669 3876
3670 This function also handles reduction idioms (patterns) that have been 3877 This function also handles reduction idioms (patterns) that have been
3671 recognized in advance during vect_pattern_recog. In this case, STMT may be 3878 recognized in advance during vect_pattern_recog. In this case, STMT may be
3672 of this form: 3879 of this form:
3673 X = pattern_expr (arg0, arg1, ..., X) 3880 X = pattern_expr (arg0, arg1, ..., X)
3674 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original 3881 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3675 sequence that had been detected and replaced by the pattern-stmt (STMT). 3882 sequence that had been detected and replaced by the pattern-stmt (STMT).
3676 3883
3687 vectorization factor should be V8HI); on the other hand, the vectype that 3894 vectorization factor should be V8HI); on the other hand, the vectype that
3688 is used to create the vector form is actually V4SI (the type of the result). 3895 is used to create the vector form is actually V4SI (the type of the result).
3689 3896
3690 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that 3897 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3691 indicates what is the actual level of parallelism (V8HI in the example), so 3898 indicates what is the actual level of parallelism (V8HI in the example), so
3692 that the right vectorization factor would be derived. This vectype 3899 that the right vectorization factor would be derived. This vectype
3693 corresponds to the type of arguments to the reduction stmt, and should *NOT* 3900 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3694 be used to create the vectorized stmt. The right vectype for the vectorized 3901 be used to create the vectorized stmt. The right vectype for the vectorized
3695 stmt is obtained from the type of the result X: 3902 stmt is obtained from the type of the result X:
3696 get_vectype_for_scalar_type (TREE_TYPE (X)) 3903 get_vectype_for_scalar_type (TREE_TYPE (X))
3697 3904
3698 This means that, contrary to "regular" reductions (or "regular" stmts in 3905 This means that, contrary to "regular" reductions (or "regular" stmts in
3699 general), the following equation: 3906 general), the following equation:
3745 tree def_arg; 3952 tree def_arg;
3746 gimple def_arg_stmt; 3953 gimple def_arg_stmt;
3747 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL; 3954 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3748 VEC (gimple, heap) *phis = NULL; 3955 VEC (gimple, heap) *phis = NULL;
3749 int vec_num; 3956 int vec_num;
3750 tree def0, def1; 3957 tree def0, def1, tem;
3751 3958
3752 if (nested_in_vect_loop_p (loop, stmt)) 3959 if (nested_in_vect_loop_p (loop, stmt))
3753 { 3960 {
3754 outer_loop = loop; 3961 outer_loop = loop;
3755 loop = loop->inner; 3962 loop = loop->inner;
3786 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt); 3993 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3787 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 3994 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3788 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); 3995 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3789 } 3996 }
3790 3997
3791 /* 3. Check the operands of the operation. The first operands are defined 3998 /* 3. Check the operands of the operation. The first operands are defined
3792 inside the loop body. The last operand is the reduction variable, 3999 inside the loop body. The last operand is the reduction variable,
3793 which is defined by the loop-header-phi. */ 4000 which is defined by the loop-header-phi. */
3794 4001
3795 gcc_assert (is_gimple_assign (stmt)); 4002 gcc_assert (is_gimple_assign (stmt));
3796 4003
3797 /* Flatten RHS */ 4004 /* Flatten RHS. */
3798 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 4005 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3799 { 4006 {
3800 case GIMPLE_SINGLE_RHS: 4007 case GIMPLE_SINGLE_RHS:
3801 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)); 4008 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3802 if (op_type == ternary_op) 4009 if (op_type == ternary_op)
3831 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) 4038 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3832 && !SCALAR_FLOAT_TYPE_P (scalar_type)) 4039 && !SCALAR_FLOAT_TYPE_P (scalar_type))
3833 return false; 4040 return false;
3834 4041
3835 /* All uses but the last are expected to be defined in the loop. 4042 /* All uses but the last are expected to be defined in the loop.
3836 The last use is the reduction variable. In case of nested cycle this 4043 The last use is the reduction variable. In case of nested cycle this
3837 assumption is not true: we use reduc_index to record the index of the 4044 assumption is not true: we use reduc_index to record the index of the
3838 reduction variable. */ 4045 reduction variable. */
3839 for (i = 0; i < op_type-1; i++) 4046 for (i = 0; i < op_type-1; i++)
3840 { 4047 {
3841 tree tem;
3842
3843 /* The condition of COND_EXPR is checked in vectorizable_condition(). */ 4048 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
3844 if (i == 0 && code == COND_EXPR) 4049 if (i == 0 && code == COND_EXPR)
3845 continue; 4050 continue;
3846 4051
3847 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, 4052 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
3862 reduc_def_stmt = def_stmt; 4067 reduc_def_stmt = def_stmt;
3863 reduc_index = i; 4068 reduc_index = i;
3864 } 4069 }
3865 } 4070 }
3866 4071
3867 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt, 4072 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
3868 &def, &dt); 4073 &def, &dt, &tem);
4074 if (!vectype_in)
4075 vectype_in = tem;
3869 gcc_assert (is_simple_use); 4076 gcc_assert (is_simple_use);
3870 gcc_assert (dt == vect_reduction_def 4077 gcc_assert (dt == vect_reduction_def
3871 || dt == vect_nested_cycle 4078 || dt == vect_nested_cycle
3872 || ((dt == vect_internal_def || dt == vect_external_def 4079 || ((dt == vect_internal_def || dt == vect_external_def
3873 || dt == vect_constant_def || dt == vect_induction_def) 4080 || dt == vect_constant_def || dt == vect_induction_def)
3920 fprintf (vect_dump, "no optab."); 4127 fprintf (vect_dump, "no optab.");
3921 4128
3922 return false; 4129 return false;
3923 } 4130 }
3924 4131
3925 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing) 4132 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
3926 { 4133 {
3927 if (vect_print_dump_info (REPORT_DETAILS)) 4134 if (vect_print_dump_info (REPORT_DETAILS))
3928 fprintf (vect_dump, "op not supported by target."); 4135 fprintf (vect_dump, "op not supported by target.");
3929 4136
3930 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD 4137 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3962 4169
3963 This means that: 4170 This means that:
3964 1. The tree-code that is used to create the vector operation in the 4171 1. The tree-code that is used to create the vector operation in the
3965 epilog code (that reduces the partial results) is not the 4172 epilog code (that reduces the partial results) is not the
3966 tree-code of STMT, but is rather the tree-code of the original 4173 tree-code of STMT, but is rather the tree-code of the original
3967 stmt from the pattern that STMT is replacing. I.e, in the example 4174 stmt from the pattern that STMT is replacing. I.e, in the example
3968 above we want to use 'widen_sum' in the loop, but 'plus' in the 4175 above we want to use 'widen_sum' in the loop, but 'plus' in the
3969 epilog. 4176 epilog.
3970 2. The type (mode) we use to check available target support 4177 2. The type (mode) we use to check available target support
3971 for the vector operation to be created in the *epilog*, is 4178 for the vector operation to be created in the *epilog*, is
3972 determined by the type of the reduction variable (in the example 4179 determined by the type of the reduction variable (in the example
3973 above we'd check this: plus_optab[vect_int_mode]). 4180 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
3974 However the type (mode) we use to check available target support 4181 However the type (mode) we use to check available target support
3975 for the vector operation to be created *inside the loop*, is 4182 for the vector operation to be created *inside the loop*, is
3976 determined by the type of the other arguments to STMT (in the 4183 determined by the type of the other arguments to STMT (in the
3977 example we'd check this: widen_sum_optab[vect_short_mode]). 4184 example we'd check this: optab_handler (widen_sum_optab,
4185 vect_short_mode)).
3978 4186
3979 This is contrary to "regular" reductions, in which the types of all 4187 This is contrary to "regular" reductions, in which the types of all
3980 the arguments are the same as the type of the reduction variable. 4188 the arguments are the same as the type of the reduction variable.
3981 For "regular" reductions we can therefore use the same vector type 4189 For "regular" reductions we can therefore use the same vector type
3982 (and also the same tree-code) when generating the epilog code and 4190 (and also the same tree-code) when generating the epilog code and
4025 4233
4026 epilog_reduc_code = ERROR_MARK; 4234 epilog_reduc_code = ERROR_MARK;
4027 } 4235 }
4028 4236
4029 if (reduc_optab 4237 if (reduc_optab
4030 && optab_handler (reduc_optab, vec_mode)->insn_code 4238 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4031 == CODE_FOR_nothing)
4032 { 4239 {
4033 if (vect_print_dump_info (REPORT_DETAILS)) 4240 if (vect_print_dump_info (REPORT_DETAILS))
4034 fprintf (vect_dump, "reduc op not supported by target."); 4241 fprintf (vect_dump, "reduc op not supported by target.");
4035 4242
4036 epilog_reduc_code = ERROR_MARK; 4243 epilog_reduc_code = ERROR_MARK;
4159 } 4366 }
4160 4367
4161 /* Handle uses. */ 4368 /* Handle uses. */
4162 if (j == 0) 4369 if (j == 0)
4163 { 4370 {
4371 tree op0, op1 = NULL_TREE;
4372
4373 op0 = ops[!reduc_index];
4374 if (op_type == ternary_op)
4375 {
4376 if (reduc_index == 0)
4377 op1 = ops[2];
4378 else
4379 op1 = ops[1];
4380 }
4381
4164 if (slp_node) 4382 if (slp_node)
4165 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1); 4383 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4384 -1);
4166 else 4385 else
4167 { 4386 {
4168 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index], 4387 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4169 stmt, NULL); 4388 stmt, NULL);
4170 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0); 4389 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4171 if (op_type == ternary_op) 4390 if (op_type == ternary_op)
4172 { 4391 {
4173 if (reduc_index == 0) 4392 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4174 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt, 4393 NULL);
4175 NULL);
4176 else
4177 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
4178 NULL);
4179
4180 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1); 4394 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4181 } 4395 }
4182 } 4396 }
4183 } 4397 }
4184 else 4398 else
4200 reduc_def = gimple_assign_lhs (new_stmt); 4414 reduc_def = gimple_assign_lhs (new_stmt);
4201 4415
4202 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; 4416 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4203 } 4417 }
4204 4418
4205 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++) 4419 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4206 { 4420 {
4207 if (slp_node) 4421 if (slp_node)
4208 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i)); 4422 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4209 else 4423 else
4210 { 4424 {
4365 return true; 4579 return true;
4366 } 4580 }
4367 4581
4368 /* Function vectorizable_live_operation. 4582 /* Function vectorizable_live_operation.
4369 4583
4370 STMT computes a value that is used outside the loop. Check if 4584 STMT computes a value that is used outside the loop. Check if
4371 it can be supported. */ 4585 it can be supported. */
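[Editor's note: a tiny plain-C picture of what "live" means here, a hypothetical loop; per the FORNOW note below, the vectorizer leaves such stmts scalar and uses the last value they compute.]

#include <stdio.h>

int
main (void)
{
  int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int last = 0;
  for (int i = 0; i < 8; i++)
    last = a[i] * 2;     /* live: its final value is used after the loop */
  printf ("%d\n", last); /* prints 16 */
  return 0;
}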
4372 4586
4373 bool 4587 bool
4374 vectorizable_live_operation (gimple stmt, 4588 vectorizable_live_operation (gimple stmt,
4375 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 4589 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4406 op_type = TREE_CODE_LENGTH (code); 4620 op_type = TREE_CODE_LENGTH (code);
4407 rhs_class = get_gimple_rhs_class (code); 4621 rhs_class = get_gimple_rhs_class (code);
4408 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op); 4622 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4409 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op); 4623 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4410 4624
4411 /* FORNOW: support only if all uses are invariant. This means 4625 /* FORNOW: support only if all uses are invariant. This means
4412 that the scalar operations can remain in place, unvectorized. 4626 that the scalar operations can remain in place, unvectorized.
4413 The original last scalar value that they compute will be used. */ 4627 The original last scalar value that they compute will be used. */
4414 4628
4415 for (i = 0; i < op_type; i++) 4629 for (i = 0; i < op_type; i++)
4416 { 4630 {
4517 4731
4518 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a 4732 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4519 compile time constant), or it is a constant that doesn't divide by the 4733 compile time constant), or it is a constant that doesn't divide by the
4520 vectorization factor, then an epilog loop needs to be created. 4734 vectorization factor, then an epilog loop needs to be created.
4521 We therefore duplicate the loop: the original loop will be vectorized, 4735 We therefore duplicate the loop: the original loop will be vectorized,
4522 and will compute the first (n/VF) iterations. The second copy of the loop 4736 and will compute the first (n/VF) iterations. The second copy of the loop
4523 will remain scalar and will compute the remaining (n%VF) iterations. 4737 will remain scalar and will compute the remaining (n%VF) iterations.
4524 (VF is the vectorization factor). */ 4738 (VF is the vectorization factor). */
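[Editor's note: a minimal sketch of that peeling scheme in plain C; VF and the data are assumed, and the real transformation of course rewrites the CFG rather than source loops.]

#include <stdio.h>

#define VF 4   /* assumed vectorization factor */

int
main (void)
{
  int a[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  int n = 10, sum = 0, i;
  int bound = (n / VF) * VF;   /* iterations the vector loop covers */

  for (i = 0; i < bound; i++)  /* copy to be vectorized: n/VF iterations */
    sum += a[i];
  for (; i < n; i++)           /* scalar epilog: the remaining n%VF */
    sum += a[i];

  printf ("%d\n", sum);        /* 45 */
  return 0;
}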
4525 4739
4526 if (do_peeling_for_loop_bound) 4740 if (do_peeling_for_loop_bound)
4527 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio, 4741 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,