comparison gcc/tree-vect-loop.c @ 67:f6334be47118

update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
author nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
date Tue, 22 Mar 2011 17:18:12 +0900
parents b7f97abdc517
children 04ced10e8804
comparing 65:65488c3d617d with 67:f6334be47118
25 #include "coretypes.h" 25 #include "coretypes.h"
26 #include "tm.h" 26 #include "tm.h"
27 #include "ggc.h" 27 #include "ggc.h"
28 #include "tree.h" 28 #include "tree.h"
29 #include "basic-block.h" 29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-pretty-print.h" 30 #include "tree-pretty-print.h"
32 #include "gimple-pretty-print.h" 31 #include "gimple-pretty-print.h"
33 #include "tree-flow.h" 32 #include "tree-flow.h"
34 #include "tree-dump.h" 33 #include "tree-dump.h"
35 #include "cfgloop.h" 34 #include "cfgloop.h"
36 #include "cfglayout.h" 35 #include "cfglayout.h"
37 #include "expr.h" 36 #include "expr.h"
38 #include "recog.h" 37 #include "recog.h"
39 #include "optabs.h" 38 #include "optabs.h"
40 #include "params.h" 39 #include "params.h"
41 #include "toplev.h" 40 #include "diagnostic-core.h"
42 #include "tree-chrec.h" 41 #include "tree-chrec.h"
43 #include "tree-scalar-evolution.h" 42 #include "tree-scalar-evolution.h"
44 #include "tree-vectorizer.h" 43 #include "tree-vectorizer.h"
44 #include "target.h"
45 45
46 /* Loop Vectorization Pass. 46 /* Loop Vectorization Pass.
47 47
48 This pass tries to vectorize loops. 48 This pass tries to vectorize loops.
49 49
73 the vectorizer applies a set of analyses on a given set of loops, 73 the vectorizer applies a set of analyses on a given set of loops,
74 followed by the actual vectorization transformation for the loops that 74 followed by the actual vectorization transformation for the loops that
75 had successfully passed the analysis phase. 75 had successfully passed the analysis phase.
76 Throughout this pass we make a distinction between two types of 76 Throughout this pass we make a distinction between two types of
77 data: scalars (which are represented by SSA_NAMES), and memory references 77 data: scalars (which are represented by SSA_NAMES), and memory references
78 ("data-refs"). These two types of data require different handling both 78 ("data-refs"). These two types of data require different handling both
79 during analysis and transformation. The types of data-refs that the 79 during analysis and transformation. The types of data-refs that the
80 vectorizer currently supports are ARRAY_REFS whose base is an array DECL 80 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer 81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
82 accesses are required to have a simple (consecutive) access pattern. 82 accesses are required to have a simple (consecutive) access pattern.
83 83
94 94
95 Transformation phase: 95 Transformation phase:
96 ===================== 96 =====================
97 The loop transformation phase scans all the stmts in the loop, and 97 The loop transformation phase scans all the stmts in the loop, and
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in 98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
99 the loop that needs to be vectorized. It inserts the vector code sequence 99 the loop that needs to be vectorized. It inserts the vector code sequence
100 just before the scalar stmt S, and records a pointer to the vector code 100 just before the scalar stmt S, and records a pointer to the vector code
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct 101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
102 attached to S). This pointer will be used for the vectorization of following 102 attached to S). This pointer will be used for the vectorization of following
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory; 103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
104 otherwise, we rely on dead code elimination for removing it. 104 otherwise, we rely on dead code elimination for removing it.
105 105
106 For example, say stmt S1 was vectorized into stmt VS1: 106 For example, say stmt S1 was vectorized into stmt VS1:
107 107
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
110 S2: a = b; 110 S2: a = b;
111 111
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines 112 To vectorize stmt S2, the vectorizer first finds the stmt that defines
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the 113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The 114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
115 resulting sequence would be: 115 resulting sequence would be:
116 116
117 VS1: vb = px[i]; 117 VS1: vb = px[i];
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
119 VS2: va = vb; 119 VS2: va = vb;
123 load/store operations (like 'x[i]' in S1), and are handled differently. 123 load/store operations (like 'x[i]' in S1), and are handled differently.
124 124
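Illustration only, not part of this changeset: the bookkeeping above pictured on a tiny loop, with VF and the vector pointers px/py assumed.

    /* Scalar loop as the pass sees it:
         S1: b = x[i];   S2: a = b;   S3: y[i] = a;   */
    void copy (float *x, float *y, int n)
    {
      int i;
      for (i = 0; i < n; i++)
        y[i] = x[i];
    }

    /* Conceptually, each scalar stmt gets a vector twin inserted just
       before it, linked via STMT_VINFO_VEC_STMT:
         VS1: vb = px[i];   VS2: va = vb;   VS3: py[i] = va;
       S3 (a memory write) is removed; S1/S2 are left for dead code
       elimination.  */
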
125 Target modeling: 125 Target modeling:
126 ================= 126 =================
127 Currently the only target specific information that is used is the 127 Currently the only target specific information that is used is the
128 size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can 128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
129 support different sizes of vectors, for now will need to specify one value 129 Targets that can support different sizes of vectors, for now will need
130 for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future. 130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
131 flexibility will be added in the future.
131 132
132 Since we only vectorize operations whose vector form can be 133 Since we only vectorize operations whose vector form can be
133 expressed using existing tree codes, to verify that an operation is 134 expressed using existing tree codes, to verify that an operation is
134 supported, the vectorizer checks the relevant optab at the relevant 135 supported, the vectorizer checks the relevant optab at the relevant
135 machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If 136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
136 the value found is CODE_FOR_nothing, then there's no target support, and 137 the value found is CODE_FOR_nothing, then there's no target support, and
137 we can't vectorize the stmt. 138 we can't vectorize the stmt.
138 139
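A hedged sketch of that support test, using the 4.6-era API this hunk switches to (optab_handler now returns the insn_code directly); vectype here is an assumed variable.

    /* Illustration: can this target add two V8HI vectors?  */
    optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
    if (!op || optab_handler (op, V8HImode) == CODE_FOR_nothing)
      return false;   /* no target support: the stmt can't be vectorized */
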
139 For additional information on this project see: 140 For additional information on this project see:
140 http://gcc.gnu.org/projects/tree-ssa/vectorization.html 141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
141 */ 142 */
142 143
143 /* Function vect_determine_vectorization_factor 144 /* Function vect_determine_vectorization_factor
144 145
145 Determine the vectorization factor (VF). VF is the number of data elements 146 Determine the vectorization factor (VF). VF is the number of data elements
146 that are operated upon in parallel in a single iteration of the vectorized 147 that are operated upon in parallel in a single iteration of the vectorized
147 loop. For example, when vectorizing a loop that operates on 4-byte elements, 148 loop. For example, when vectorizing a loop that operates on 4-byte elements,
148 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4 149 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
149 elements can fit in a single vector register. 150 elements can fit in a single vector register.
150 151
151 We currently support vectorization of loops in which all types operated upon 152 We currently support vectorization of loops in which all types operated upon
152 are of the same size. Therefore this function currently sets VF according to 153 are of the same size. Therefore this function currently sets VF according to
153 the size of the types operated upon, and fails if there are multiple sizes 154 the size of the types operated upon, and fails if there are multiple sizes
154 in the loop. 155 in the loop.
155 156
156 VF is also the factor by which the loop iterations are strip-mined, e.g.: 157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
157 original loop: 158 original loop:
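The example's body is elided by the diff context; as a hypothetical sketch, strip-mining by VF = 4 with GCC's generic vector extension (alignment concerns ignored):

    typedef int v4si __attribute__ ((vector_size (16)));

    for (i = 0; i < (N & ~3); i += 4)               /* vector loop     */
      *(v4si *) &a[i] = *(v4si *) &b[i] + *(v4si *) &c[i];
    for (; i < N; i++)                              /* scalar epilogue */
      a[i] = b[i] + c[i];
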
434 } 435 }
435 436
436 /* Function vect_analyze_scalar_cycles_1. 437 /* Function vect_analyze_scalar_cycles_1.
437 438
438 Examine the cross iteration def-use cycles of scalar variables 439 Examine the cross iteration def-use cycles of scalar variables
439 in LOOP. LOOP_VINFO represents the loop that is now being 440 in LOOP. LOOP_VINFO represents the loop that is now being
440 considered for vectorization (can be LOOP, or an outer-loop 441 considered for vectorization (can be LOOP, or an outer-loop
441 enclosing LOOP). */ 442 enclosing LOOP). */
442 443
443 static void 444 static void
444 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) 445 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
450 bool double_reduc; 451 bool double_reduc;
451 452
452 if (vect_print_dump_info (REPORT_DETAILS)) 453 if (vect_print_dump_info (REPORT_DETAILS))
453 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ==="); 454 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
454 455
455 /* First - identify all inductions. Reduction detection assumes that all the 456 /* First - identify all inductions. Reduction detection assumes that all the
456 inductions have been identified, therefore, this order must not be 457 inductions have been identified, therefore, this order must not be
457 changed. */ 458 changed. */
458 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 459 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
459 { 460 {
460 gimple phi = gsi_stmt (gsi); 461 gimple phi = gsi_stmt (gsi);
466 { 467 {
467 fprintf (vect_dump, "Analyze phi: "); 468 fprintf (vect_dump, "Analyze phi: ");
468 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); 469 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
469 } 470 }
470 471
471 /* Skip virtual phi's. The data dependences that are associated with 472 /* Skip virtual phi's. The data dependences that are associated with
472 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */ 473 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
473 if (!is_gimple_reg (SSA_NAME_VAR (def))) 474 if (!is_gimple_reg (SSA_NAME_VAR (def)))
474 continue; 475 continue;
475 476
476 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type; 477 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
477 478
478 /* Analyze the evolution function. */ 479 /* Analyze the evolution function. */
479 access_fn = analyze_scalar_evolution (loop, def); 480 access_fn = analyze_scalar_evolution (loop, def);
481 if (access_fn)
482 STRIP_NOPS (access_fn);
480 if (access_fn && vect_print_dump_info (REPORT_DETAILS)) 483 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
481 { 484 {
482 fprintf (vect_dump, "Access function of PHI: "); 485 fprintf (vect_dump, "Access function of PHI: ");
483 print_generic_expr (vect_dump, access_fn, TDF_SLIM); 486 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
484 } 487 }
565 568
566 569
567 /* Function vect_analyze_scalar_cycles. 570 /* Function vect_analyze_scalar_cycles.
568 571
569 Examine the cross iteration def-use cycles of scalar variables, by 572 Examine the cross iteration def-use cycles of scalar variables, by
570 analyzing the loop-header PHIs of scalar variables; Classify each 573 analyzing the loop-header PHIs of scalar variables. Classify each
571 cycle as one of the following: invariant, induction, reduction, unknown. 574 cycle as one of the following: invariant, induction, reduction, unknown.
572 We do that for the loop represented by LOOP_VINFO, and also for its 575 We do that for the loop represented by LOOP_VINFO, and also for its
573 inner-loop, if it exists. 576 inner-loop, if it exists.
574 Examples for scalar cycles: 577 Examples for scalar cycles:
575 578
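The examples themselves fall outside this hunk; a hypothetical pair for two of the categories:

    for (i = 0; i < n; i++)   /* induction: i = PHI <0, i + 1>            */
      sum += a[i];            /* reduction: sum = PHI <init, sum + a[i]>;
                                 its value is only used after the loop    */
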
740 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL; 743 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
741 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0; 744 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
742 LOOP_VINFO_VECTORIZABLE_P (res) = 0; 745 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
743 LOOP_PEELING_FOR_ALIGNMENT (res) = 0; 746 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
744 LOOP_VINFO_VECT_FACTOR (res) = 0; 747 LOOP_VINFO_VECT_FACTOR (res) = 0;
748 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
745 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10); 749 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
746 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10); 750 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
747 LOOP_VINFO_UNALIGNED_DR (res) = NULL; 751 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
748 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = 752 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
749 VEC_alloc (gimple, heap, 753 VEC_alloc (gimple, heap,
753 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); 757 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
754 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10); 758 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
755 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10); 759 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
756 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10); 760 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
757 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1; 761 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
762 LOOP_VINFO_PEELING_HTAB (res) = NULL;
758 763
759 return res; 764 return res;
760 } 765 }
761 766
762 767
787 if (!clean_stmts) 792 if (!clean_stmts)
788 { 793 {
789 free (LOOP_VINFO_BBS (loop_vinfo)); 794 free (LOOP_VINFO_BBS (loop_vinfo));
790 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo)); 795 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
791 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); 796 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
797 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
792 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); 798 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
799 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
793 800
794 free (loop_vinfo); 801 free (loop_vinfo);
795 loop->aux = NULL; 802 loop->aux = NULL;
796 return; 803 return;
797 } 804 }
833 } 840 }
834 841
835 free (LOOP_VINFO_BBS (loop_vinfo)); 842 free (LOOP_VINFO_BBS (loop_vinfo));
836 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo)); 843 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
837 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); 844 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
845 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
838 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); 846 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
839 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)); 847 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
840 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); 848 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
841 for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++) 849 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
842 vect_free_slp_instance (instance); 850 vect_free_slp_instance (instance);
843 851
844 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); 852 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
845 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo)); 853 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
846 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo)); 854 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
855
856 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
857 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
847 858
848 free (loop_vinfo); 859 free (loop_vinfo);
849 loop->aux = NULL; 860 loop->aux = NULL;
850 } 861 }
851 862
1115 loop->aux = loop_vinfo; 1126 loop->aux = loop_vinfo;
1116 return loop_vinfo; 1127 return loop_vinfo;
1117 } 1128 }
1118 1129
1119 1130
1131 /* Get cost by calling cost target builtin. */
1132
1133 static inline int
1134 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1135 {
1136 tree dummy_type = NULL;
1137 int dummy = 0;
1138
1139 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1140 dummy_type, dummy);
1141 }
1142
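Illustrative use of the new helper, taken from call sites later in this diff:

    peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
                             + vect_get_cost (cond_branch_not_taken));
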
1143
1120 /* Function vect_analyze_loop_operations. 1144 /* Function vect_analyze_loop_operations.
1121 1145
1122 Scan the loop stmts and make sure they are all vectorizable. */ 1146 Scan the loop stmts and make sure they are all vectorizable. */
1123 1147
1124 static bool 1148 static bool
1280 fprintf (vect_dump,"not vectorized: iteration count smaller than " 1304 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1281 "vectorization factor."); 1305 "vectorization factor.");
1282 return false; 1306 return false;
1283 } 1307 }
1284 1308
1285 /* Analyze cost. Decide if it is worthwhile to vectorize. */ 1309 /* Analyze cost. Decide if it is worthwhile to vectorize. */
1286 1310
1287 /* Once VF is set, SLP costs should be updated since the number of created 1311 /* Once VF is set, SLP costs should be updated since the number of created
1288 vector stmts depends on VF. */ 1312 vector stmts depends on VF. */
1289 vect_update_slp_costs_according_to_vf (loop_vinfo); 1313 vect_update_slp_costs_according_to_vf (loop_vinfo);
1290 1314
1350 1374
1351 return true; 1375 return true;
1352 } 1376 }
1353 1377
1354 1378
1355 /* Function vect_analyze_loop. 1379 /* Function vect_analyze_loop_2.
1356 1380
1357 Apply a set of analyses on LOOP, and create a loop_vec_info struct 1381 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1358 for it. The different analyses will record information in the 1382 for it. The different analyses will record information in the
1359 loop_vec_info struct. */ 1383 loop_vec_info struct. */
1360 loop_vec_info 1384 static bool
1361 vect_analyze_loop (struct loop *loop) 1385 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1362 { 1386 {
1363 bool ok; 1387 bool ok, dummy;
1364 loop_vec_info loop_vinfo;
1365 int max_vf = MAX_VECTORIZATION_FACTOR; 1388 int max_vf = MAX_VECTORIZATION_FACTOR;
1366 int min_vf = 2; 1389 int min_vf = 2;
1367
1368 if (vect_print_dump_info (REPORT_DETAILS))
1369 fprintf (vect_dump, "===== analyze_loop_nest =====");
1370
1371 if (loop_outer (loop)
1372 && loop_vec_info_for_loop (loop_outer (loop))
1373 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1374 {
1375 if (vect_print_dump_info (REPORT_DETAILS))
1376 fprintf (vect_dump, "outer-loop already vectorized.");
1377 return NULL;
1378 }
1379
1380 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1381
1382 loop_vinfo = vect_analyze_loop_form (loop);
1383 if (!loop_vinfo)
1384 {
1385 if (vect_print_dump_info (REPORT_DETAILS))
1386 fprintf (vect_dump, "bad loop form.");
1387 return NULL;
1388 }
1389 1390
1390 /* Find all data references in the loop (which correspond to vdefs/vuses) 1391 /* Find all data references in the loop (which correspond to vdefs/vuses)
1391 and analyze their evolution in the loop. Also adjust the minimal 1392 and analyze their evolution in the loop. Also adjust the minimal
1392 vectorization factor according to the loads and stores. 1393 vectorization factor according to the loads and stores.
1393 1394
1397 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf); 1398 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1398 if (!ok) 1399 if (!ok)
1399 { 1400 {
1400 if (vect_print_dump_info (REPORT_DETAILS)) 1401 if (vect_print_dump_info (REPORT_DETAILS))
1401 fprintf (vect_dump, "bad data references."); 1402 fprintf (vect_dump, "bad data references.");
1402 destroy_loop_vec_info (loop_vinfo, true); 1403 return false;
1403 return NULL;
1404 } 1404 }
1405 1405
1406 /* Classify all cross-iteration scalar data-flow cycles. 1406 /* Classify all cross-iteration scalar data-flow cycles.
1407 Cross-iteration cycles caused by virtual phis are analyzed separately. */ 1407 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1408 1408
1415 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); 1415 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1416 if (!ok) 1416 if (!ok)
1417 { 1417 {
1418 if (vect_print_dump_info (REPORT_DETAILS)) 1418 if (vect_print_dump_info (REPORT_DETAILS))
1419 fprintf (vect_dump, "unexpected pattern."); 1419 fprintf (vect_dump, "unexpected pattern.");
1420 destroy_loop_vec_info (loop_vinfo, true); 1420 return false;
1421 return NULL;
1422 } 1421 }
1423 1422
1424 /* Analyze data dependences between the data-refs in the loop 1423 /* Analyze data dependences between the data-refs in the loop
1425 and adjust the maximum vectorization factor according to 1424 and adjust the maximum vectorization factor according to
1426 the dependences. 1425 the dependences.
1427 FORNOW: fail at the first data dependence that we encounter. */ 1426 FORNOW: fail at the first data dependence that we encounter. */
1428 1427
1429 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf); 1428 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
1430 if (!ok 1429 if (!ok
1431 || max_vf < min_vf) 1430 || max_vf < min_vf)
1432 { 1431 {
1433 if (vect_print_dump_info (REPORT_DETAILS)) 1432 if (vect_print_dump_info (REPORT_DETAILS))
1434 fprintf (vect_dump, "bad data dependence."); 1433 fprintf (vect_dump, "bad data dependence.");
1435 destroy_loop_vec_info (loop_vinfo, true); 1434 return false;
1436 return NULL;
1437 } 1435 }
1438 1436
1439 ok = vect_determine_vectorization_factor (loop_vinfo); 1437 ok = vect_determine_vectorization_factor (loop_vinfo);
1440 if (!ok) 1438 if (!ok)
1441 { 1439 {
1442 if (vect_print_dump_info (REPORT_DETAILS)) 1440 if (vect_print_dump_info (REPORT_DETAILS))
1443 fprintf (vect_dump, "can't determine vectorization factor."); 1441 fprintf (vect_dump, "can't determine vectorization factor.");
1444 destroy_loop_vec_info (loop_vinfo, true); 1442 return false;
1445 return NULL;
1446 } 1443 }
1447 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo)) 1444 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1448 { 1445 {
1449 if (vect_print_dump_info (REPORT_DETAILS)) 1446 if (vect_print_dump_info (REPORT_DETAILS))
1450 fprintf (vect_dump, "bad data dependence."); 1447 fprintf (vect_dump, "bad data dependence.");
1451 destroy_loop_vec_info (loop_vinfo, true); 1448 return false;
1452 return NULL;
1453 } 1449 }
1454 1450
1455 /* Analyze the alignment of the data-refs in the loop. 1451 /* Analyze the alignment of the data-refs in the loop.
1456 Fail if a data reference is found that cannot be vectorized. */ 1452 Fail if a data reference is found that cannot be vectorized. */
1457 1453
1458 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL); 1454 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1459 if (!ok) 1455 if (!ok)
1460 { 1456 {
1461 if (vect_print_dump_info (REPORT_DETAILS)) 1457 if (vect_print_dump_info (REPORT_DETAILS))
1462 fprintf (vect_dump, "bad data alignment."); 1458 fprintf (vect_dump, "bad data alignment.");
1463 destroy_loop_vec_info (loop_vinfo, true); 1459 return false;
1464 return NULL;
1465 } 1460 }
1466 1461
1467 /* Analyze the access patterns of the data-refs in the loop (consecutive, 1462 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1468 complex, etc.). FORNOW: Only handle consecutive access pattern. */ 1463 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1469 1464
1470 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL); 1465 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1471 if (!ok) 1466 if (!ok)
1472 { 1467 {
1473 if (vect_print_dump_info (REPORT_DETAILS)) 1468 if (vect_print_dump_info (REPORT_DETAILS))
1474 fprintf (vect_dump, "bad data access."); 1469 fprintf (vect_dump, "bad data access.");
1475 destroy_loop_vec_info (loop_vinfo, true); 1470 return false;
1476 return NULL;
1477 } 1471 }
1478 1472
1479 /* Prune the list of ddrs to be tested at run-time by versioning for alias. 1473 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1480 It is important to call pruning after vect_analyze_data_ref_accesses, 1474 It is important to call pruning after vect_analyze_data_ref_accesses,
1481 since we use grouping information gathered by interleaving analysis. */ 1475 since we use grouping information gathered by interleaving analysis. */
1483 if (!ok) 1477 if (!ok)
1484 { 1478 {
1485 if (vect_print_dump_info (REPORT_DETAILS)) 1479 if (vect_print_dump_info (REPORT_DETAILS))
1486 fprintf (vect_dump, "too long list of versioning for alias " 1480 fprintf (vect_dump, "too long list of versioning for alias "
1487 "run-time tests."); 1481 "run-time tests.");
1488 destroy_loop_vec_info (loop_vinfo, true); 1482 return false;
1489 return NULL; 1483 }
1484
1485 /* This pass will decide on using loop versioning and/or loop peeling in
1486 order to enhance the alignment of data references in the loop. */
1487
1488 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1489 if (!ok)
1490 {
1491 if (vect_print_dump_info (REPORT_DETAILS))
1492 fprintf (vect_dump, "bad data alignment.");
1493 return false;
1490 } 1494 }
1491 1495
1492 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ 1496 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1493 ok = vect_analyze_slp (loop_vinfo, NULL); 1497 ok = vect_analyze_slp (loop_vinfo, NULL);
1494 if (ok) 1498 if (ok)
1498 1502
1499 /* Find stmts that need to be both vectorized and SLPed. */ 1503 /* Find stmts that need to be both vectorized and SLPed. */
1500 vect_detect_hybrid_slp (loop_vinfo); 1504 vect_detect_hybrid_slp (loop_vinfo);
1501 } 1505 }
1502 1506
1503 /* This pass will decide on using loop versioning and/or loop peeling in
1504 order to enhance the alignment of data references in the loop. */
1505
1506 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1507 if (!ok)
1508 {
1509 if (vect_print_dump_info (REPORT_DETAILS))
1510 fprintf (vect_dump, "bad data alignment.");
1511 destroy_loop_vec_info (loop_vinfo, true);
1512 return NULL;
1513 }
1514
1515 /* Scan all the operations in the loop and make sure they are 1507 /* Scan all the operations in the loop and make sure they are
1516 vectorizable. */ 1508 vectorizable. */
1517 1509
1518 ok = vect_analyze_loop_operations (loop_vinfo); 1510 ok = vect_analyze_loop_operations (loop_vinfo);
1519 if (!ok) 1511 if (!ok)
1520 { 1512 {
1521 if (vect_print_dump_info (REPORT_DETAILS)) 1513 if (vect_print_dump_info (REPORT_DETAILS))
1522 fprintf (vect_dump, "bad operation or unsupported loop bound."); 1514 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1515 return false;
1516 }
1517
1518 return true;
1519 }
1520
1521 /* Function vect_analyze_loop.
1522
1523 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1524 for it. The different analyses will record information in the
1525 loop_vec_info struct. */
1526 loop_vec_info
1527 vect_analyze_loop (struct loop *loop)
1528 {
1529 loop_vec_info loop_vinfo;
1530 unsigned int vector_sizes;
1531
1532 /* Autodetect first vector size we try. */
1533 current_vector_size = 0;
1534 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1535
1536 if (vect_print_dump_info (REPORT_DETAILS))
1537 fprintf (vect_dump, "===== analyze_loop_nest =====");
1538
1539 if (loop_outer (loop)
1540 && loop_vec_info_for_loop (loop_outer (loop))
1541 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1542 {
1543 if (vect_print_dump_info (REPORT_DETAILS))
1544 fprintf (vect_dump, "outer-loop already vectorized.");
1545 return NULL;
1546 }
1547
1548 while (1)
1549 {
1550 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1551 loop_vinfo = vect_analyze_loop_form (loop);
1552 if (!loop_vinfo)
1553 {
1554 if (vect_print_dump_info (REPORT_DETAILS))
1555 fprintf (vect_dump, "bad loop form.");
1556 return NULL;
1557 }
1558
1559 if (vect_analyze_loop_2 (loop_vinfo))
1560 {
1561 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1562
1563 return loop_vinfo;
1564 }
1565
1523 destroy_loop_vec_info (loop_vinfo, true); 1566 destroy_loop_vec_info (loop_vinfo, true);
1524 return NULL; 1567
1525 } 1568 vector_sizes &= ~current_vector_size;
1526 1569 if (vector_sizes == 0
1527 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; 1570 || current_vector_size == 0)
1528 1571 return NULL;
1529 return loop_vinfo; 1572
1573 /* Try the next biggest vector size. */
1574 current_vector_size = 1 << floor_log2 (vector_sizes);
1575 if (vect_print_dump_info (REPORT_DETAILS))
1576 fprintf (vect_dump, "***** Re-trying analysis with "
1577 "vector size %d\n", current_vector_size);
1578 }
1530 } 1579 }
1531 1580
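Worked example of the retry arithmetic on a hypothetical target: if autovectorize_vector_sizes () returns 32 | 16 = 48 and the autodetected current_vector_size is 32 but analysis fails, the mask becomes 48 & ~32 = 16 and the next attempt uses 1 << floor_log2 (16) = 16 bytes; once the mask is empty the function returns NULL.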
1532 1581
1533 /* Function reduction_code_for_scalar_code 1582 /* Function reduction_code_for_scalar_code
1534 1583
1573 return false; 1622 return false;
1574 } 1623 }
1575 } 1624 }
1576 1625
1577 1626
1578 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement 1627 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1579 STMT is printed with a message MSG. */ 1628 STMT is printed with a message MSG. */
1580 1629
1581 static void 1630 static void
1582 report_vect_op (gimple stmt, const char *msg) 1631 report_vect_op (gimple stmt, const char *msg)
1583 { 1632 {
1587 1636
1588 1637
1589 /* Function vect_is_simple_reduction_1 1638 /* Function vect_is_simple_reduction_1
1590 1639
1591 (1) Detect a cross-iteration def-use cycle that represents a simple 1640 (1) Detect a cross-iteration def-use cycle that represents a simple
1592 reduction computation. We look for the following pattern: 1641 reduction computation. We look for the following pattern:
1593 1642
1594 loop_header: 1643 loop_header:
1595 a1 = phi < a0, a2 > 1644 a1 = phi < a0, a2 >
1596 a3 = ... 1645 a3 = ...
1597 a2 = operation (a3, a1) 1646 a2 = operation (a3, a1)
1598 1647
1599 such that: 1648 such that:
1600 1. operation is commutative and associative and it is safe to 1649 1. operation is commutative and associative and it is safe to
1601 change the order of the computation (if CHECK_REDUCTION is true) 1650 change the order of the computation (if CHECK_REDUCTION is true)
1602 2. no uses for a2 in the loop (a2 is used out of the loop) 1651 2. no uses for a2 in the loop (a2 is used out of the loop)
1603 3. no uses of a1 in the loop besides the reduction operation. 1652 3. no uses of a1 in the loop besides the reduction operation
1604 1653 4. no uses of a1 outside the loop.
1605 Condition 1 is tested here. 1654
1655 Conditions 1,4 are tested here.
1606 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. 1656 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1607 1657
1608 (2) Detect a cross-iteration def-use cycle in nested loops, i.e., 1658 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1609 nested cycles, if CHECK_REDUCTION is false. 1659 nested cycles, if CHECK_REDUCTION is false.
1610 1660
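A hypothetical loop matching pattern (1), annotated with the comment's SSA names:

    int sum = a0;               /* a1 = PHI <a0, a2> in the loop header  */
    for (i = 0; i < n; i++)
      sum = sum + x[i];         /* a3 = x[i];  a2 = a1 + a3, a PLUS_EXPR
                                   (commutative and associative)         */
    use (sum);                  /* a2 escapes only after the loop        */
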
1651 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) 1701 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1652 { 1702 {
1653 gimple use_stmt = USE_STMT (use_p); 1703 gimple use_stmt = USE_STMT (use_p);
1654 if (is_gimple_debug (use_stmt)) 1704 if (is_gimple_debug (use_stmt))
1655 continue; 1705 continue;
1656 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) 1706
1657 && vinfo_for_stmt (use_stmt) 1707 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1708 {
1709 if (vect_print_dump_info (REPORT_DETAILS))
1710 fprintf (vect_dump, "intermediate value used outside loop.");
1711
1712 return NULL;
1713 }
1714
1715 if (vinfo_for_stmt (use_stmt)
1658 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt))) 1716 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1659 nloop_uses++; 1717 nloop_uses++;
1660 if (nloop_uses > 1) 1718 if (nloop_uses > 1)
1661 { 1719 {
1662 if (vect_print_dump_info (REPORT_DETAILS)) 1720 if (vect_print_dump_info (REPORT_DETAILS))
1754 1812
1755 /* We can handle "res -= x[i]", which is non-associative, by 1813 /* We can handle "res -= x[i]", which is non-associative, by
1756 simply rewriting it into "res += -x[i]". Avoid changing the 1814 simply rewriting it into "res += -x[i]". Avoid changing the
1757 gimple instruction for the first simple tests and only do this 1815 gimple instruction for the first simple tests and only do this
1758 if we're allowed to change code at all. */ 1816 if we're allowed to change code at all. */
1759 if (code == MINUS_EXPR && modify) 1817 if (code == MINUS_EXPR
1818 && modify
1819 && (op1 = gimple_assign_rhs1 (def_stmt))
1820 && TREE_CODE (op1) == SSA_NAME
1821 && SSA_NAME_DEF_STMT (op1) == phi)
1760 code = PLUS_EXPR; 1822 code = PLUS_EXPR;
1761 1823
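Concretely: a hypothetical "res -= x[i]" reduction has its MINUS_EXPR re-interpreted as "res += -x[i]", a PLUS_EXPR; the strengthened guard above additionally checks that rhs1 of the definition really is the reduction phi before flipping the code.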
1762 if (check_reduction 1824 if (check_reduction
1763 && (!commutative_tree_code (code) || !associative_tree_code (code))) 1825 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1764 { 1826 {
1919 1981
1920 if (def2 && def2 == phi 1982 if (def2 && def2 == phi
1921 && (code == COND_EXPR 1983 && (code == COND_EXPR
1922 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1)) 1984 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1923 && (is_gimple_assign (def1) 1985 && (is_gimple_assign (def1)
1986 || is_gimple_call (def1)
1924 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) 1987 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1925 == vect_induction_def 1988 == vect_induction_def
1926 || (gimple_code (def1) == GIMPLE_PHI 1989 || (gimple_code (def1) == GIMPLE_PHI
1927 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) 1990 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1928 == vect_internal_def 1991 == vect_internal_def
1934 } 1997 }
1935 else if (def1 && def1 == phi 1998 else if (def1 && def1 == phi
1936 && (code == COND_EXPR 1999 && (code == COND_EXPR
1937 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2)) 2000 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1938 && (is_gimple_assign (def2) 2001 && (is_gimple_assign (def2)
2002 || is_gimple_call (def2)
1939 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) 2003 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1940 == vect_induction_def 2004 == vect_induction_def
1941 || (gimple_code (def2) == GIMPLE_PHI 2005 || (gimple_code (def2) == GIMPLE_PHI
1942 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) 2006 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1943 == vect_internal_def 2007 == vect_internal_def
1991 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi, 2055 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
1992 bool check_reduction, bool *double_reduc) 2056 bool check_reduction, bool *double_reduc)
1993 { 2057 {
1994 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction, 2058 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
1995 double_reduc, true); 2059 double_reduc, true);
2060 }
2061
2062 /* Calculate the cost of one scalar iteration of the loop. */
2063 int
2064 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2065 {
2066 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2067 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2068 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2069 int innerloop_iters, i, stmt_cost;
2070
2071 /* Count statements in scalar loop. Using this as scalar cost for a single
2072 iteration for now.
2073
2074 TODO: Add outer loop support.
2075
2076 TODO: Consider assigning different costs to different scalar
2077 statements. */
2078
2079 /* FORNOW. */
2080 innerloop_iters = 1;
2081 if (loop->inner)
2082 innerloop_iters = 50; /* FIXME */
2083
2084 for (i = 0; i < nbbs; i++)
2085 {
2086 gimple_stmt_iterator si;
2087 basic_block bb = bbs[i];
2088
2089 if (bb->loop_father == loop->inner)
2090 factor = innerloop_iters;
2091 else
2092 factor = 1;
2093
2094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2095 {
2096 gimple stmt = gsi_stmt (si);
2097 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2098
2099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2100 continue;
2101
2102 /* Skip stmts that are not vectorized inside the loop. */
2103 if (stmt_info
2104 && !STMT_VINFO_RELEVANT_P (stmt_info)
2105 && (!STMT_VINFO_LIVE_P (stmt_info)
2106 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2107 continue;
2108
2109 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2110 {
2111 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2112 stmt_cost = vect_get_cost (scalar_load);
2113 else
2114 stmt_cost = vect_get_cost (scalar_store);
2115 }
2116 else
2117 stmt_cost = vect_get_cost (scalar_stmt);
2118
2119 scalar_single_iter_cost += stmt_cost * factor;
2120 }
2121 }
2122 return scalar_single_iter_cost;
2123 }
2124
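For instance, assuming the target hook returns 1 for each cost kind: a body with one load, one store and two other statements costs 1 + 1 + 2 = 4 per scalar iteration, and statements inside an inner loop are weighted by the FORNOW factor of 50.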
2125 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2126 int
2127 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2128 int *peel_iters_epilogue,
2129 int scalar_single_iter_cost)
2130 {
2131 int peel_guard_costs = 0;
2132 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2133
2134 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2135 {
2136 *peel_iters_epilogue = vf/2;
2137 if (vect_print_dump_info (REPORT_COST))
2138 fprintf (vect_dump, "cost model: "
2139 "epilogue peel iters set to vf/2 because "
2140 "loop iterations are unknown .");
2141
2142 /* If peeled iterations are known but number of scalar loop
2143 iterations are unknown, count a taken branch per peeled loop. */
2144 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2145 }
2146 else
2147 {
2148 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2149 peel_iters_prologue = niters < peel_iters_prologue ?
2150 niters : peel_iters_prologue;
2151 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2152 }
2153
2154 return (peel_iters_prologue * scalar_single_iter_cost)
2155 + (*peel_iters_epilogue * scalar_single_iter_cost)
2156 + peel_guard_costs;
1996 } 2157 }
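Worked example for the known-niters branch: with niters = 100, vf = 4 and peel_iters_prologue = 3, the epilogue gets (100 - 3) % 4 = 1 iteration and the function returns (3 + 1) * scalar_single_iter_cost plus zero guard cost.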
1997 2158
1998 /* Function vect_estimate_min_profitable_iters 2159 /* Function vect_estimate_min_profitable_iters
1999 2160
2000 Return the number of iterations required for the vector version of the 2161 Return the number of iterations required for the vector version of the
2017 int scalar_outside_cost = 0; 2178 int scalar_outside_cost = 0;
2018 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2179 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2180 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2020 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 2181 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2021 int nbbs = loop->num_nodes; 2182 int nbbs = loop->num_nodes;
2022 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); 2183 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2023 int peel_guard_costs = 0; 2184 int peel_guard_costs = 0;
2024 int innerloop_iters = 0, factor; 2185 int innerloop_iters = 0, factor;
2025 VEC (slp_instance, heap) *slp_instances; 2186 VEC (slp_instance, heap) *slp_instances;
2026 slp_instance instance; 2187 slp_instance instance;
2027 2188
2055 "versioning aliasing.\n"); 2216 "versioning aliasing.\n");
2056 } 2217 }
2057 2218
2058 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) 2219 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2059 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 2220 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2060 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST; 2221 vec_outside_cost += vect_get_cost (cond_branch_taken);
2061 2222
2062 /* Count statements in scalar loop. Using this as scalar cost for a single 2223 /* Count statements in scalar loop. Using this as scalar cost for a single
2063 iteration for now. 2224 iteration for now.
2064 2225
2065 TODO: Add outer loop support. 2226 TODO: Add outer loop support.
2088 /* Skip stmts that are not vectorized inside the loop. */ 2249 /* Skip stmts that are not vectorized inside the loop. */
2089 if (!STMT_VINFO_RELEVANT_P (stmt_info) 2250 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2090 && (!STMT_VINFO_LIVE_P (stmt_info) 2251 && (!STMT_VINFO_LIVE_P (stmt_info)
2091 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) 2252 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2092 continue; 2253 continue;
2093 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
2094 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; 2254 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2095 /* FIXME: for stmts in the inner-loop in outer-loop vectorization, 2255 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2096 some of the "outside" costs are generated inside the outer-loop. */ 2256 some of the "outside" costs are generated inside the outer-loop. */
2097 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); 2257 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2098 } 2258 }
2099 } 2259 }
2100 2260
2261 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2262
2101 /* Add additional cost for the peeled instructions in prologue and epilogue 2263 /* Add additional cost for the peeled instructions in prologue and epilogue
2102 loop. 2264 loop.
2103 2265
2104 FORNOW: If we don't know the value of peel_iters for prologue or epilogue 2266 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2105 at compile-time - we assume it's vf/2 (the worst would be vf-1). 2267 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2106 2268
2107 TODO: Build an expression that represents peel_iters for prologue and 2269 TODO: Build an expression that represents peel_iters for prologue and
2108 epilogue to be used in a run-time test. */ 2270 epilogue to be used in a run-time test. */
2109 2271
2110 if (byte_misalign < 0) 2272 if (npeel < 0)
2111 { 2273 {
2112 peel_iters_prologue = vf/2; 2274 peel_iters_prologue = vf/2;
2113 if (vect_print_dump_info (REPORT_COST)) 2275 if (vect_print_dump_info (REPORT_COST))
2114 fprintf (vect_dump, "cost model: " 2276 fprintf (vect_dump, "cost model: "
2115 "prologue peel iters set to vf/2."); 2277 "prologue peel iters set to vf/2.");
2124 2286
2125 /* If peeled iterations are unknown, count a taken branch and a not taken 2287 /* If peeled iterations are unknown, count a taken branch and a not taken
2126 branch per peeled loop. Even if scalar loop iterations are known, 2288 branch per peeled loop. Even if scalar loop iterations are known,
2127 vector iterations are not known since peeled prologue iterations are 2289 vector iterations are not known since peeled prologue iterations are
2128 not known. Hence guards remain the same. */ 2290 not known. Hence guards remain the same. */
2129 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST 2291 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2130 + TARG_COND_NOT_TAKEN_BRANCH_COST); 2292 + vect_get_cost (cond_branch_not_taken));
2293 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2294 + (peel_iters_epilogue * scalar_single_iter_cost)
2295 + peel_guard_costs;
2131 } 2296 }
2132 else 2297 else
2133 { 2298 {
2134 if (byte_misalign) 2299 peel_iters_prologue = npeel;
2135 { 2300 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2136 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 2301 peel_iters_prologue, &peel_iters_epilogue,
2137 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); 2302 scalar_single_iter_cost);
2138 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); 2303 }
2139 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2140
2141 peel_iters_prologue = nelements - (byte_misalign / element_size);
2142 }
2143 else
2144 peel_iters_prologue = 0;
2145
2146 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2147 {
2148 peel_iters_epilogue = vf/2;
2149 if (vect_print_dump_info (REPORT_COST))
2150 fprintf (vect_dump, "cost model: "
2151 "epilogue peel iters set to vf/2 because "
2152 "loop iterations are unknown .");
2153
2154 /* If peeled iterations are known but number of scalar loop
2155 iterations are unknown, count a taken branch per peeled loop. */
2156 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
2157
2158 }
2159 else
2160 {
2161 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2162 peel_iters_prologue = niters < peel_iters_prologue ?
2163 niters : peel_iters_prologue;
2164 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2165 }
2166 }
2167
2168 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2169 + (peel_iters_epilogue * scalar_single_iter_cost)
2170 + peel_guard_costs;
2171 2304
2172 /* FORNOW: The scalar outside cost is incremented in one of the 2305 /* FORNOW: The scalar outside cost is incremented in one of the
2173 following ways: 2306 following ways:
2174 2307
2175 1. The vectorizer checks for alignment and aliasing and generates 2308 1. The vectorizer checks for alignment and aliasing and generates
2218 TODO: The back end may reorder the BBS's differently and reverse 2351 TODO: The back end may reorder the BBS's differently and reverse
2219 conditions/branch directions. Change the estimates below to 2352 conditions/branch directions. Change the estimates below to
2220 something more reasonable. */ 2353 something more reasonable. */
2221 2354
2222 /* If the number of iterations is known and we do not do versioning, we can 2355 /* If the number of iterations is known and we do not do versioning, we can
2223 decide whether to vectorize at compile time. Hence the scalar version 2356 decide whether to vectorize at compile time. Hence the scalar version
2224 do not carry cost model guard costs. */ 2357 do not carry cost model guard costs. */
2225 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2358 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2226 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) 2359 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2227 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 2360 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2228 { 2361 {
2229 /* Cost model check occurs at versioning. */ 2362 /* Cost model check occurs at versioning. */
2230 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) 2363 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2231 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 2364 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2232 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST; 2365 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2233 else 2366 else
2234 { 2367 {
2235 /* Cost model check occurs at prologue generation. */ 2368 /* Cost model check occurs at prologue generation. */
2236 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 2369 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2237 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST 2370 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2238 + TARG_COND_NOT_TAKEN_BRANCH_COST; 2371 + vect_get_cost (cond_branch_not_taken);
2239 /* Cost model check occurs at epilogue generation. */ 2372 /* Cost model check occurs at epilogue generation. */
2240 else 2373 else
2241 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST; 2374 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2242 } 2375 }
2243 } 2376 }
2244 2377
2245 /* Add SLP costs. */ 2378 /* Add SLP costs. */
2246 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); 2379 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2247 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) 2380 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2248 { 2381 {
2249 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance); 2382 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2250 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance); 2383 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2251 } 2384 }
2252 2385
2253 /* Calculate number of iterations required to make the vector version 2386 /* Calculate number of iterations required to make the vector version
2254 profitable, relative to the loop bodies only. The following condition 2387 profitable, relative to the loop bodies only. The following condition
2255 must hold true: 2388 must hold true:
2256 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC 2389 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2257 where 2390 where
2258 SIC = scalar iteration cost, VIC = vector iteration cost, 2391 SIC = scalar iteration cost, VIC = vector iteration cost,
2259 VOC = vector outside cost, VF = vectorization factor, 2392 VOC = vector outside cost, VF = vectorization factor,
2346 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2479 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2347 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2480 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2348 2481
2349 2482
2350 /* Cost of reduction op inside loop. */ 2483 /* Cost of reduction op inside loop. */
2351 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST; 2484 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2485 += ncopies * vect_get_cost (vector_stmt);
2352 2486
2353 stmt = STMT_VINFO_STMT (stmt_info); 2487 stmt = STMT_VINFO_STMT (stmt_info);
2354 2488
2355 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 2489 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2356 { 2490 {
2386 orig_stmt = STMT_VINFO_STMT (stmt_info); 2520 orig_stmt = STMT_VINFO_STMT (stmt_info);
2387 2521
2388 code = gimple_assign_rhs_code (orig_stmt); 2522 code = gimple_assign_rhs_code (orig_stmt);
2389 2523
2390 /* Add in cost for initial definition. */ 2524 /* Add in cost for initial definition. */
2391 outer_cost += TARG_SCALAR_TO_VEC_COST; 2525 outer_cost += vect_get_cost (scalar_to_vec);
2392 2526
2393 /* Determine cost of epilogue code. 2527 /* Determine cost of epilogue code.
2394 2528
2395 We have a reduction operator that will reduce the vector in one statement. 2529 We have a reduction operator that will reduce the vector in one statement.
2396 Also requires scalar extract. */ 2530 Also requires scalar extract. */
2397 2531
2398 if (!nested_in_vect_loop_p (loop, orig_stmt)) 2532 if (!nested_in_vect_loop_p (loop, orig_stmt))
2399 { 2533 {
2400 if (reduc_code != ERROR_MARK) 2534 if (reduc_code != ERROR_MARK)
2401 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST; 2535 outer_cost += vect_get_cost (vector_stmt)
2536 + vect_get_cost (vec_to_scalar);
2402 else 2537 else
2403 { 2538 {
2404 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); 2539 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2405 tree bitsize = 2540 tree bitsize =
2406 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt))); 2541 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2409 2544
2410 optab = optab_for_tree_code (code, vectype, optab_default); 2545 optab = optab_for_tree_code (code, vectype, optab_default);
2411 2546
2412 /* We have a whole vector shift available. */ 2547 /* We have a whole vector shift available. */
2413 if (VECTOR_MODE_P (mode) 2548 if (VECTOR_MODE_P (mode)
2414 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing 2549 && optab_handler (optab, mode) != CODE_FOR_nothing
2415 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) 2550 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2416 /* Final reduction via vector shifts and the reduction operator. Also 2551 /* Final reduction via vector shifts and the reduction operator. Also
2417 requires scalar extract. */ 2552 requires scalar extract. */
2418 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST 2553 outer_cost += ((exact_log2(nelements) * 2)
2419 + TARG_VEC_TO_SCALAR_COST); 2554 * vect_get_cost (vector_stmt)
2555 + vect_get_cost (vec_to_scalar));
2420 else 2556 else
2421 /* Use extracts and reduction op for final reduction. For N elements, 2557 /* Use extracts and reduction op for final reduction. For N elements,
2422 we have N extracts and N-1 reduction ops. */ 2558 we have N extracts and N-1 reduction ops. */
2423 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST); 2559 outer_cost += ((nelements + nelements - 1)
2560 * vect_get_cost (vector_stmt));
2424 } 2561 }
2425 } 2562 }
2426 2563
2427 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost; 2564 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2428 2565
2441 2578
2442 static void 2579 static void
2443 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies) 2580 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2444 { 2581 {
2445 /* loop cost for vec_loop. */ 2582 /* loop cost for vec_loop. */
2446 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST; 2583 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2584 = ncopies * vect_get_cost (vector_stmt);
2447 /* prologue cost for vec_init and vec_step. */ 2585 /* prologue cost for vec_init and vec_step. */
2448 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST; 2586 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2587 = 2 * vect_get_cost (scalar_to_vec);
2449 2588
2450 if (vect_print_dump_info (REPORT_COST)) 2589 if (vect_print_dump_info (REPORT_COST))
2451 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, " 2590 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2452 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info), 2591 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2453 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); 2592 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2460 STMT - a stmt that performs an induction operation in the loop. 2599 STMT - a stmt that performs an induction operation in the loop.
2461 IV_PHI - the initial value of the induction variable 2600 IV_PHI - the initial value of the induction variable
2462 2601
2463 Output: 2602 Output:
2464 Return a vector variable, initialized with the first VF values of 2603 Return a vector variable, initialized with the first VF values of
2465 the induction variable. E.g., for an iv with IV_PHI='X' and 2604 the induction variable. E.g., for an iv with IV_PHI='X' and
2466 evolution S, for a vector of 4 units, we want to return: 2605 evolution S, for a vector of 4 units, we want to return:
2467 [X, X + S, X + 2*S, X + 3*S]. */ 2606 [X, X + S, X + 2*S, X + 3*S]. */
2468 2607
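For a hypothetical iv with X = 10, S = 3 and a 4-unit vector, the returned initial def is {10, 13, 16, 19}; the loop phi then advances it by a step vector built from VF * S, here {12, 12, 12, 12}.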
2469 static tree 2608 static tree
2470 get_initial_def_for_induction (gimple iv_phi) 2609 get_initial_def_for_induction (gimple iv_phi)
2471 { 2610 {
2472 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi); 2611 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); 2612 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2474 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2613 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2475 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi)); 2614 tree scalar_type;
2476 tree vectype; 2615 tree vectype;
2477 int nunits; 2616 int nunits;
2478 edge pe = loop_preheader_edge (loop); 2617 edge pe = loop_preheader_edge (loop);
2479 struct loop *iv_loop; 2618 struct loop *iv_loop;
2480 basic_block new_bb; 2619 basic_block new_bb;
2499 edge latch_e; 2638 edge latch_e;
2500 tree loop_arg; 2639 tree loop_arg;
2501 gimple_stmt_iterator si; 2640 gimple_stmt_iterator si;
2502 basic_block bb = gimple_bb (iv_phi); 2641 basic_block bb = gimple_bb (iv_phi);
2503 tree stepvectype; 2642 tree stepvectype;
2504 2643 tree resvectype;
2505 vectype = get_vectype_for_scalar_type (scalar_type);
2506 gcc_assert (vectype);
2507 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2508 ncopies = vf / nunits;
2509
2510 gcc_assert (phi_info);
2511 gcc_assert (ncopies >= 1);
2512
2513 /* Find the first insertion point in the BB. */
2514 si = gsi_after_labels (bb);
2515
2516 if (INTEGRAL_TYPE_P (scalar_type))
2517 step_expr = build_int_cst (scalar_type, 0);
2518 else if (POINTER_TYPE_P (scalar_type))
2519 step_expr = build_int_cst (sizetype, 0);
2520 else
2521 step_expr = build_real (scalar_type, dconst0);
2522 2644
2523 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */ 2645 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2524 if (nested_in_vect_loop_p (loop, iv_phi)) 2646 if (nested_in_vect_loop_p (loop, iv_phi))
2525 { 2647 {
2526 nested_in_vect_loop = true; 2648 nested_in_vect_loop = true;
2533 latch_e = loop_latch_edge (iv_loop); 2655 latch_e = loop_latch_edge (iv_loop);
2534 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e); 2656 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2535 2657
2536 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi)); 2658 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2537 gcc_assert (access_fn); 2659 gcc_assert (access_fn);
2660 STRIP_NOPS (access_fn);
2538 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn, 2661 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2539 &init_expr, &step_expr); 2662 &init_expr, &step_expr);
2540 gcc_assert (ok); 2663 gcc_assert (ok);
2541 pe = loop_preheader_edge (iv_loop); 2664 pe = loop_preheader_edge (iv_loop);
2542 2665
2666 scalar_type = TREE_TYPE (init_expr);
2667 vectype = get_vectype_for_scalar_type (scalar_type);
2668 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2669 gcc_assert (vectype);
2670 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2671 ncopies = vf / nunits;
2672
2673 gcc_assert (phi_info);
2674 gcc_assert (ncopies >= 1);
2675
2676 /* Find the first insertion point in the BB. */
2677 si = gsi_after_labels (bb);
2678
2543 /* Create the vector that holds the initial_value of the induction. */ 2679 /* Create the vector that holds the initial_value of the induction. */
2544 if (nested_in_vect_loop) 2680 if (nested_in_vect_loop)
2545 { 2681 {
2546 /* iv_loop is nested in the loop to be vectorized. init_expr had already 2682 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2547 been created during vectorization of previous stmts; We obtain it from 2683 been created during vectorization of previous stmts. We obtain it
2548 the STMT_VINFO_VEC_STMT of the defining stmt. */ 2684 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2549 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, 2685 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2550 loop_preheader_edge (iv_loop)); 2686 loop_preheader_edge (iv_loop));
2551 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL); 2687 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2552 } 2688 }
2553 else 2689 else
2563 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 2699 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2564 gcc_assert (!new_bb); 2700 gcc_assert (!new_bb);
2565 } 2701 }
2566 2702
2567 t = NULL_TREE; 2703 t = NULL_TREE;
2568 t = tree_cons (NULL_TREE, init_expr, t); 2704 t = tree_cons (NULL_TREE, new_name, t);
2569 for (i = 1; i < nunits; i++) 2705 for (i = 1; i < nunits; i++)
2570 { 2706 {
2571 /* Create: new_name_i = new_name + step_expr */ 2707 /* Create: new_name_i = new_name + step_expr */
2572 enum tree_code code = POINTER_TYPE_P (scalar_type) 2708 enum tree_code code = POINTER_TYPE_P (scalar_type)
2573 ? POINTER_PLUS_EXPR : PLUS_EXPR; 2709 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2604 expr = build_int_cst (TREE_TYPE (step_expr), vf); 2740 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2605 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 2741 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2606 expr, step_expr); 2742 expr, step_expr);
2607 } 2743 }
2608 2744
2609 t = NULL_TREE; 2745 t = unshare_expr (new_name);
2610 for (i = 0; i < nunits; i++)
2611 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2612 gcc_assert (CONSTANT_CLASS_P (new_name)); 2746 gcc_assert (CONSTANT_CLASS_P (new_name));
2613 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name)); 2747 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2614 gcc_assert (stepvectype); 2748 gcc_assert (stepvectype);
2615 vec = build_vector (stepvectype, t); 2749 vec = build_vector_from_val (stepvectype, t);
2616 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); 2750 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2617 2751
2618 2752
2619 /* Create the following def-use cycle: 2753 /* Create the following def-use cycle:
2620 loop prolog: 2754 loop prolog:
2664 2798
2665 /* Create the vector that holds the step of the induction. */ 2799 /* Create the vector that holds the step of the induction. */
2666 expr = build_int_cst (TREE_TYPE (step_expr), nunits); 2800 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2667 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 2801 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2668 expr, step_expr); 2802 expr, step_expr);
2669 t = NULL_TREE; 2803 t = unshare_expr (new_name);
2670 for (i = 0; i < nunits; i++)
2671 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2672 gcc_assert (CONSTANT_CLASS_P (new_name)); 2804 gcc_assert (CONSTANT_CLASS_P (new_name));
2673 vec = build_vector (stepvectype, t); 2805 vec = build_vector_from_val (stepvectype, t);
2674 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); 2806 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2675 2807
2676 vec_def = induc_def; 2808 vec_def = induc_def;
2677 prev_stmt_vinfo = vinfo_for_stmt (induction_phi); 2809 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2678 for (i = 1; i < ncopies; i++) 2810 for (i = 1; i < ncopies; i++)
2682 vec_def, vec_step); 2814 vec_def, vec_step);
2683 vec_def = make_ssa_name (vec_dest, new_stmt); 2815 vec_def = make_ssa_name (vec_dest, new_stmt);
2684 gimple_assign_set_lhs (new_stmt, vec_def); 2816 gimple_assign_set_lhs (new_stmt, vec_def);
2685 2817
2686 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 2818 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2819 if (!useless_type_conversion_p (resvectype, vectype))
2820 {
2821 new_stmt = gimple_build_assign_with_ops
2822 (VIEW_CONVERT_EXPR,
2823 vect_get_new_vect_var (resvectype, vect_simple_var,
2824 "vec_iv_"),
2825 build1 (VIEW_CONVERT_EXPR, resvectype,
2826 gimple_assign_lhs (new_stmt)), NULL_TREE);
2827 gimple_assign_set_lhs (new_stmt,
2828 make_ssa_name
2829 (gimple_assign_lhs (new_stmt), new_stmt));
2830 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2831 }
2687 set_vinfo_for_stmt (new_stmt, 2832 set_vinfo_for_stmt (new_stmt,
2688 new_stmt_vec_info (new_stmt, loop_vinfo, NULL)); 2833 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2689 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; 2834 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2690 prev_stmt_vinfo = vinfo_for_stmt (new_stmt); 2835 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2691 } 2836 }
2729 fprintf (vect_dump, "\n"); 2874 fprintf (vect_dump, "\n");
2730 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM); 2875 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2731 } 2876 }
2732 2877
2733 STMT_VINFO_VEC_STMT (phi_info) = induction_phi; 2878 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2879 if (!useless_type_conversion_p (resvectype, vectype))
2880 {
2881 new_stmt = gimple_build_assign_with_ops
2882 (VIEW_CONVERT_EXPR,
2883 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
2884 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
2885 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
2886 gimple_assign_set_lhs (new_stmt, induc_def);
2887 si = gsi_start_bb (bb);
2888 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2889 set_vinfo_for_stmt (new_stmt,
2890 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2891 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
2892 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
2893 }
2894
2734 return induc_def; 2895 return induc_def;
2735 } 2896 }
2736 2897
2737 2898
2738 /* Function get_initial_def_for_reduction 2899 /* Function get_initial_def_for_reduction
2809 nested_in_vect_loop = true; 2970 nested_in_vect_loop = true;
2810 else 2971 else
2811 gcc_assert (loop == (gimple_bb (stmt))->loop_father); 2972 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2812 2973
2813 /* In case of double reduction we only create a vector variable to be put 2974 /* In case of double reduction we only create a vector variable to be put
2814 in the reduction phi node. The actual statement creation is done in 2975 in the reduction phi node. The actual statement creation is done in
2815 vect_create_epilog_for_reduction. */ 2976 vect_create_epilog_for_reduction. */
2816 if (adjustment_def && nested_in_vect_loop 2977 if (adjustment_def && nested_in_vect_loop
2817 && TREE_CODE (init_val) == SSA_NAME 2978 && TREE_CODE (init_val) == SSA_NAME
2818 && (def_stmt = SSA_NAME_DEF_STMT (init_val)) 2979 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2819 && gimple_code (def_stmt) == GIMPLE_PHI 2980 && gimple_code (def_stmt) == GIMPLE_PHI
2855 NULL); 3016 NULL);
2856 else 3017 else
2857 *adjustment_def = init_val; 3018 *adjustment_def = init_val;
2858 } 3019 }
2859 3020
2860 if (code == MULT_EXPR || code == BIT_AND_EXPR) 3021 if (code == MULT_EXPR)
2861 { 3022 {
2862 real_init_val = dconst1; 3023 real_init_val = dconst1;
2863 int_init_val = 1; 3024 int_init_val = 1;
2864 } 3025 }
3026
3027 if (code == BIT_AND_EXPR)
3028 int_init_val = -1;
2865 3029
2866 if (SCALAR_FLOAT_TYPE_P (scalar_type)) 3030 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2867 def_for_init = build_real (scalar_type, real_init_val); 3031 def_for_init = build_real (scalar_type, real_init_val);
2868 else 3032 else
2869 def_for_init = build_int_cst (scalar_type, int_init_val); 3033 def_for_init = build_int_cst (scalar_type, int_init_val);
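[Editor's note: the hunk above also corrects the neutral element for BIT_AND; the old code reused 1, the new code uses -1 (all ones), since x & ~0 == x. A hedged sketch of the mapping, with the tree codes replaced by a plain enum; the real code builds trees via build_int_cst / build_real rather than returning ints.]

#include <stdio.h>

enum red_code { RED_PLUS, RED_MULT, RED_BIT_AND, RED_BIT_IOR };

static int
neutral_init_val (enum red_code code)
{
  switch (code)
    {
    case RED_MULT:    return 1;    /* x * 1 == x */
    case RED_BIT_AND: return -1;   /* x & ~0 == x (the fix above) */
    default:          return 0;    /* x + 0 == x, x | 0 == x */
    }
}

int
main (void)
{
  printf ("%d\n", neutral_init_val (RED_BIT_AND));  /* -1 */
  return 0;
}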
2897 *adjustment_def = NULL_TREE; 3061 *adjustment_def = NULL_TREE;
2898 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL); 3062 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2899 break; 3063 break;
2900 } 3064 }
2901 3065
2902 for (i = nunits - 1; i >= 0; --i) 3066 init_def = build_vector_from_val (vectype, init_value);
2903 t = tree_cons (NULL_TREE, init_value, t);
2904
2905 if (TREE_CONSTANT (init_val))
2906 init_def = build_vector (vectype, t);
2907 else
2908 init_def = build_constructor_from_list (vectype, t);
2909
2910 break; 3067 break;
2911 3068
2912 default: 3069 default:
2913 gcc_unreachable (); 3070 gcc_unreachable ();
2914 } 3071 }
2924 3081
2925 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of vector 3082 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of vector
2926 reduction statements. 3083 reduction statements.
2927 STMT is the scalar reduction stmt that is being vectorized. 3084 STMT is the scalar reduction stmt that is being vectorized.
2928 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the 3085 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2929 number of elements that we can fit in a vectype (nunits). In this case 3086 number of elements that we can fit in a vectype (nunits). In this case
2930 we have to generate more than one vector stmt, i.e., we need to "unroll" 3087 we have to generate more than one vector stmt, i.e., we need to "unroll"
2931 the vector stmt by a factor VF/nunits. For more details see documentation 3088 the vector stmt by a factor VF/nunits. For more details see documentation
2932 in vectorizable_operation. 3089 in vectorizable_operation.
2933 REDUC_CODE is the tree-code for the epilog reduction. 3090 REDUC_CODE is the tree-code for the epilog reduction.
2934 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction 3091 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3004 tree bitsize, bitpos; 3161 tree bitsize, bitpos;
3005 tree adjustment_def = NULL; 3162 tree adjustment_def = NULL;
3006 tree vec_initial_def = NULL; 3163 tree vec_initial_def = NULL;
3007 tree reduction_op, expr, def; 3164 tree reduction_op, expr, def;
3008 tree orig_name, scalar_result; 3165 tree orig_name, scalar_result;
3009 imm_use_iterator imm_iter; 3166 imm_use_iterator imm_iter, phi_imm_iter;
3010 use_operand_p use_p; 3167 use_operand_p use_p, phi_use_p;
3011 bool extract_scalar_result = false; 3168 bool extract_scalar_result = false;
3012 gimple use_stmt, orig_stmt, reduction_phi = NULL; 3169 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3013 bool nested_in_vect_loop = false; 3170 bool nested_in_vect_loop = false;
3014 VEC (gimple, heap) *new_phis = NULL; 3171 VEC (gimple, heap) *new_phis = NULL;
3015 enum vect_def_type dt = vect_unknown_def_type; 3172 enum vect_def_type dt = vect_unknown_def_type;
3069 3226
3070 (in case of SLP, do it for all the phis). */ 3227 (in case of SLP, do it for all the phis). */
3071 3228
3072 /* Get the loop-entry arguments. */ 3229 /* Get the loop-entry arguments. */
3073 if (slp_node) 3230 if (slp_node)
3074 vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index); 3231 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3232 NULL, reduc_index);
3075 else 3233 else
3076 { 3234 {
3077 vec_initial_defs = VEC_alloc (tree, heap, 1); 3235 vec_initial_defs = VEC_alloc (tree, heap, 1);
3078 /* For the case of reduction, vect_get_vec_def_for_operand returns 3236 /* For the case of reduction, vect_get_vec_def_for_operand returns
3079 the scalar def before the loop, that defines the initial value 3237 the scalar def before the loop, that defines the initial value
3082 &adjustment_def); 3240 &adjustment_def);
3083 VEC_quick_push (tree, vec_initial_defs, vec_initial_def); 3241 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3084 } 3242 }
3085 3243
3086 /* Set phi nodes arguments. */ 3244 /* Set phi nodes arguments. */
3087 for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++) 3245 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3088 { 3246 {
3089 tree vec_init_def = VEC_index (tree, vec_initial_defs, i); 3247 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3090 tree def = VEC_index (tree, vect_defs, i); 3248 tree def = VEC_index (tree, vect_defs, i);
3091 for (j = 0; j < ncopies; j++) 3249 for (j = 0; j < ncopies; j++)
3092 { 3250 {
3148 Store them in NEW_PHIS. */ 3306 Store them in NEW_PHIS. */
3149 3307
3150 exit_bb = single_exit (loop)->dest; 3308 exit_bb = single_exit (loop)->dest;
3151 prev_phi_info = NULL; 3309 prev_phi_info = NULL;
3152 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs)); 3310 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3153 for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++) 3311 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3154 { 3312 {
3155 for (j = 0; j < ncopies; j++) 3313 for (j = 0; j < ncopies; j++)
3156 { 3314 {
3157 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb); 3315 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3158 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL)); 3316 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3165 } 3323 }
3166 3324
3167 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); 3325 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3168 prev_phi_info = vinfo_for_stmt (phi); 3326 prev_phi_info = vinfo_for_stmt (phi);
3169 } 3327 }
3328 }
3329
3330 /* The epilogue is created for the outer-loop, i.e., for the loop being
3331 vectorized. */
3332 if (double_reduc)
3333 {
3334 loop = outer_loop;
3335 exit_bb = single_exit (loop)->dest;
3170 } 3336 }
3171 3337
3172 exit_gsi = gsi_after_labels (exit_bb); 3338 exit_gsi = gsi_after_labels (exit_bb);
3173 3339
3174 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 3340 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3207 bitsize = TYPE_SIZE (scalar_type); 3373 bitsize = TYPE_SIZE (scalar_type);
3208 3374
3209 /* In case this is a reduction in an inner-loop while vectorizing an outer 3375 /* In case this is a reduction in an inner-loop while vectorizing an outer
3210 loop, we don't need to extract a single scalar result at the end of the 3376 loop, we don't need to extract a single scalar result at the end of the
3211 inner-loop (unless it is a double reduction, i.e., the use of the reduction is 3377 inner-loop (unless it is a double reduction, i.e., the use of the reduction is
3212 outside the outer-loop). The final vector of partial results will be used 3378 outside the outer-loop). The final vector of partial results will be used
3213 in the vectorized outer-loop, or reduced to a scalar result at the end of 3379 in the vectorized outer-loop, or reduced to a scalar result at the end of
3214 the outer-loop. */ 3380 the outer-loop. */
3215 if (nested_in_vect_loop && !double_reduc) 3381 if (nested_in_vect_loop && !double_reduc)
3216 goto vect_finalize_reduction; 3382 goto vect_finalize_reduction;
3217 3383
3245 int bit_offset; 3411 int bit_offset;
3246 int element_bitsize = tree_low_cst (bitsize, 1); 3412 int element_bitsize = tree_low_cst (bitsize, 1);
3247 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); 3413 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3248 tree vec_temp; 3414 tree vec_temp;
3249 3415
3250 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) 3416 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3251 shift_code = VEC_RSHIFT_EXPR; 3417 shift_code = VEC_RSHIFT_EXPR;
3252 else 3418 else
3253 have_whole_vector_shift = false; 3419 have_whole_vector_shift = false;
3254 3420
3255 /* Regardless of whether we have a whole vector shift, if we're 3421 /* Regardless of whether we have a whole vector shift, if we're
3261 if (!VECTOR_MODE_P (mode)) 3427 if (!VECTOR_MODE_P (mode))
3262 have_whole_vector_shift = false; 3428 have_whole_vector_shift = false;
3263 else 3429 else
3264 { 3430 {
3265 optab optab = optab_for_tree_code (code, vectype, optab_default); 3431 optab optab = optab_for_tree_code (code, vectype, optab_default);
3266 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) 3432 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3267 have_whole_vector_shift = false; 3433 have_whole_vector_shift = false;
3268 } 3434 }
3269 3435
3270 if (have_whole_vector_shift && !slp_node) 3436 if (have_whole_vector_shift && !slp_node)
3271 { 3437 {
3319 3485
3320 if (vect_print_dump_info (REPORT_DETAILS)) 3486 if (vect_print_dump_info (REPORT_DETAILS))
3321 fprintf (vect_dump, "Reduce using scalar code. "); 3487 fprintf (vect_dump, "Reduce using scalar code. ");
3322 3488
3323 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); 3489 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3324 for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++) 3490 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3325 { 3491 {
3326 vec_temp = PHI_RESULT (new_phi); 3492 vec_temp = PHI_RESULT (new_phi);
3327 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, 3493 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3328 bitsize_zero_node); 3494 bitsize_zero_node);
3329 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 3495 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3366 } 3532 }
3367 } 3533 }
3368 } 3534 }
3369 3535
3370 /* The only case where we need to reduce scalar results in SLP, is 3536 /* The only case where we need to reduce scalar results in SLP, is
3371 unrolling. If the size of SCALAR_RESULTS is greater than 3537 unrolling. If the size of SCALAR_RESULTS is greater than
3372 GROUP_SIZE, we reduce them combining elements modulo 3538 GROUP_SIZE, we reduce them combining elements modulo
3373 GROUP_SIZE. */ 3539 GROUP_SIZE. */
3374 if (slp_node) 3540 if (slp_node)
3375 { 3541 {
3376 tree res, first_res, new_res; 3542 tree res, first_res, new_res;
3422 VEC_safe_push (tree, heap, scalar_results, new_temp); 3588 VEC_safe_push (tree, heap, scalar_results, new_temp);
3423 } 3589 }
3424 3590
3425 vect_finalize_reduction: 3591 vect_finalize_reduction:
3426 3592
3593 if (double_reduc)
3594 loop = loop->inner;
3595
3427 /* 2.5 Adjust the final result by the initial value of the reduction 3596 /* 2.5 Adjust the final result by the initial value of the reduction
3428 variable. (When such adjustment is not needed, then 3597 variable. (When such adjustment is not needed, then
3429 'adjustment_def' is zero). For example, if code is PLUS we create: 3598 'adjustment_def' is zero). For example, if code is PLUS we create:
3430 new_temp = loop_exit_def + adjustment_def */ 3599 new_temp = loop_exit_def + adjustment_def */
3431 3600
3469 VEC_replace (tree, scalar_results, 0, new_temp); 3638 VEC_replace (tree, scalar_results, 0, new_temp);
3470 3639
3471 VEC_replace (gimple, new_phis, 0, epilog_stmt); 3640 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3472 } 3641 }
3473 3642
3474 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit 3643 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3475 phis with new adjusted scalar results, i.e., replace use <s_out0> 3644 phis with new adjusted scalar results, i.e., replace use <s_out0>
3476 with use <s_out4>. 3645 with use <s_out4>.
3477 3646
3478 Transform: 3647 Transform:
3479 loop_exit: 3648 loop_exit:
3495 s_out4 = adjust_result <s_out3> 3664 s_out4 = adjust_result <s_out3>
3496 use <s_out4> 3665 use <s_out4>
3497 use <s_out4> */ 3666 use <s_out4> */
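[Editor's note: a hedged end-to-end illustration of steps 2.3 through 2.6 for a PLUS reduction with VF == 4, in plain C with made-up data; the real pass emits gimple, not loops.]

#include <stdio.h>

int
main (void)
{
  int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int s_init = 100;               /* initial value of the reduction */
  int vec_dest[4] = {0, 0, 0, 0}; /* vector accumulator, neutral init */
  int i, l;

  for (i = 0; i < 8; i += 4)      /* the vectorized loop */
    for (l = 0; l < 4; l++)
      vec_dest[l] += a[i + l];

  int s_out3 = 0;                 /* 2.3/2.4: reduce the partial results */
  for (l = 0; l < 4; l++)
    s_out3 += vec_dest[l];

  int s_out4 = s_out3 + s_init;   /* 2.5: adjust by adjustment_def */

  printf ("%d\n", s_out4);        /* 136; 2.6: uses now read s_out4 */
  return 0;
}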
3498 3667
3499 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in 3668 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3500 case that GROUP_SIZE is greater than the vectorization factor). Therefore, we 3669 case that GROUP_SIZE is greater than the vectorization factor). Therefore, we
3501 need to match SCALAR_RESULTS with corresponding statements. The first 3670 need to match SCALAR_RESULTS with corresponding statements. The first
3502 (GROUP_SIZE / number of new vector stmts) scalar results correspond to 3671 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3503 the first vector stmt, etc. 3672 the first vector stmt, etc.
3504 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */ 3673 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
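[Editor's note: a small worked instance of the RATIO bookkeeping, with assumed sizes just to pin the arithmetic down.]

#include <stdio.h>

int
main (void)
{
  int group_size = 4;   /* assumed SLP group size */
  int n_vec_stmts = 2;  /* assumed VEC_length (gimple, new_phis) */
  int ratio = group_size / n_vec_stmts;   /* == 2 */

  /* Scalar results 0..1 pair with the first vector stmt,
     results 2..3 with the second.  */
  for (int k = 0; k < group_size; k++)
    printf ("scalar result %d -> vector stmt %d\n", k, k / ratio);
  return 0;
}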
3505 if (group_size > VEC_length (gimple, new_phis)) 3674 if (group_size > VEC_length (gimple, new_phis))
3506 { 3675 {
3529 scalar_dest = gimple_assign_lhs (current_stmt); 3698 scalar_dest = gimple_assign_lhs (current_stmt);
3530 } 3699 }
3531 3700
3532 phis = VEC_alloc (gimple, heap, 3); 3701 phis = VEC_alloc (gimple, heap, 3);
3533 /* Find the loop-closed-use at the loop exit of the original scalar 3702 /* Find the loop-closed-use at the loop exit of the original scalar
3534 result. (The reduction result is expected to have two immediate uses - 3703 result. (The reduction result is expected to have two immediate uses -
3535 one at the latch block, and one at the loop exit). */ 3704 one at the latch block, and one at the loop exit). */
3536 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 3705 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3537 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) 3706 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3538 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p)); 3707 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3539 3708
3540 /* We expect to have found an exit_phi because of loop-closed-ssa 3709 /* We expect to have found an exit_phi because of loop-closed-ssa
3541 form. */ 3710 form. */
3542 gcc_assert (!VEC_empty (gimple, phis)); 3711 gcc_assert (!VEC_empty (gimple, phis));
3543 3712
3544 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++) 3713 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3545 { 3714 {
3546 if (outer_loop) 3715 if (outer_loop)
3547 { 3716 {
3548 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); 3717 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3549 gimple vect_phi; 3718 gimple vect_phi;
3630 } 3799 }
3631 3800
3632 vect_phi_res = PHI_RESULT (vect_phi); 3801 vect_phi_res = PHI_RESULT (vect_phi);
3633 3802
3634 /* Replace the use, i.e., set the correct vs1 in the regular 3803 /* Replace the use, i.e., set the correct vs1 in the regular
3635 reduction phi node. FORNOW, NCOPIES is always 1, so the 3804 reduction phi node. FORNOW, NCOPIES is always 1, so the
3636 loop is redundant. */ 3805 loop is redundant. */
3637 use = reduction_phi; 3806 use = reduction_phi;
3638 for (j = 0; j < ncopies; j++) 3807 for (j = 0; j < ncopies; j++)
3639 { 3808 {
3640 edge pr_edge = loop_preheader_edge (loop); 3809 edge pr_edge = loop_preheader_edge (loop);
3641 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); 3810 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3642 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); 3811 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3643 } 3812 }
3644 } 3813 }
3645 } 3814 }
3646 3815 }
3816
3817 VEC_free (gimple, heap, phis);
3818 if (nested_in_vect_loop)
3819 {
3820 if (double_reduc)
3821 loop = outer_loop;
3822 else
3823 continue;
3824 }
3825
3826 phis = VEC_alloc (gimple, heap, 3);
3827 /* Find the loop-closed-use at the loop exit of the original scalar
3828 result. (The reduction result is expected to have two immediate uses,
3829 one at the latch block, and one at the loop exit). For double
3830 reductions we are looking for exit phis of the outer loop. */
3831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3832 {
3833 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3834 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3835 else
3836 {
3837 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
3838 {
3839 tree phi_res = PHI_RESULT (USE_STMT (use_p));
3840
3841 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
3842 {
3843 if (!flow_bb_inside_loop_p (loop,
3844 gimple_bb (USE_STMT (phi_use_p))))
3845 VEC_safe_push (gimple, heap, phis,
3846 USE_STMT (phi_use_p));
3847 }
3848 }
3849 }
3850 }
3851
3852 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3853 {
3647 /* Replace the uses: */ 3854 /* Replace the uses: */
3648 orig_name = PHI_RESULT (exit_phi); 3855 orig_name = PHI_RESULT (exit_phi);
3649 scalar_result = VEC_index (tree, scalar_results, k); 3856 scalar_result = VEC_index (tree, scalar_results, k);
3650 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 3857 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3651 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 3858 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3666 If VEC_STMT is also passed, vectorize the STMT: create a vectorized 3873 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3667 stmt to replace it, put it in VEC_STMT, and insert it at GSI. 3874 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3668 Return FALSE if not a vectorizable STMT, TRUE otherwise. 3875 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3669 3876
3670 This function also handles reduction idioms (patterns) that have been 3877 This function also handles reduction idioms (patterns) that have been
3671 recognized in advance during vect_pattern_recog. In this case, STMT may be 3878 recognized in advance during vect_pattern_recog. In this case, STMT may be
3672 of this form: 3879 of this form:
3673 X = pattern_expr (arg0, arg1, ..., X) 3880 X = pattern_expr (arg0, arg1, ..., X)
3674 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original 3881 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3675 sequence that had been detected and replaced by the pattern-stmt (STMT). 3882 sequence that had been detected and replaced by the pattern-stmt (STMT).
3676 3883
3687 vectorization factor should be V8HI); on the other hand, the vectype that 3894 vectorization factor should be V8HI); on the other hand, the vectype that
3688 is used to create the vector form is actually V4SI (the type of the result). 3895 is used to create the vector form is actually V4SI (the type of the result).
3689 3896
3690 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that 3897 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3691 indicates what is the actual level of parallelism (V8HI in the example), so 3898 indicates what is the actual level of parallelism (V8HI in the example), so
3692 that the right vectorization factor would be derived. This vectype 3899 that the right vectorization factor would be derived. This vectype
3693 corresponds to the type of arguments to the reduction stmt, and should *NOT* 3900 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3694 be used to create the vectorized stmt. The right vectype for the vectorized 3901 be used to create the vectorized stmt. The right vectype for the vectorized
3695 stmt is obtained from the type of the result X: 3902 stmt is obtained from the type of the result X:
3696 get_vectype_for_scalar_type (TREE_TYPE (X)) 3903 get_vectype_for_scalar_type (TREE_TYPE (X))
3697 3904
3698 This means that, contrary to "regular" reductions (or "regular" stmts in 3905 This means that, contrary to "regular" reductions (or "regular" stmts in
3699 general), the following equation: 3906 general), the following equation:
3745 tree def_arg; 3952 tree def_arg;
3746 gimple def_arg_stmt; 3953 gimple def_arg_stmt;
3747 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL; 3954 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3748 VEC (gimple, heap) *phis = NULL; 3955 VEC (gimple, heap) *phis = NULL;
3749 int vec_num; 3956 int vec_num;
3750 tree def0, def1; 3957 tree def0, def1, tem;
3751 3958
3752 if (nested_in_vect_loop_p (loop, stmt)) 3959 if (nested_in_vect_loop_p (loop, stmt))
3753 { 3960 {
3754 outer_loop = loop; 3961 outer_loop = loop;
3755 loop = loop->inner; 3962 loop = loop->inner;
3786 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt); 3993 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3787 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 3994 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3788 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); 3995 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3789 } 3996 }
3790 3997
3791 /* 3. Check the operands of the operation. The first operands are defined 3998 /* 3. Check the operands of the operation. The first operands are defined
3792 inside the loop body. The last operand is the reduction variable, 3999 inside the loop body. The last operand is the reduction variable,
3793 which is defined by the loop-header-phi. */ 4000 which is defined by the loop-header-phi. */
3794 4001
3795 gcc_assert (is_gimple_assign (stmt)); 4002 gcc_assert (is_gimple_assign (stmt));
3796 4003
3797 /* Flatten RHS */ 4004 /* Flatten RHS. */
3798 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 4005 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3799 { 4006 {
3800 case GIMPLE_SINGLE_RHS: 4007 case GIMPLE_SINGLE_RHS:
3801 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)); 4008 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3802 if (op_type == ternary_op) 4009 if (op_type == ternary_op)
3831 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) 4038 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3832 && !SCALAR_FLOAT_TYPE_P (scalar_type)) 4039 && !SCALAR_FLOAT_TYPE_P (scalar_type))
3833 return false; 4040 return false;
3834 4041
3835 /* All uses but the last are expected to be defined in the loop. 4042 /* All uses but the last are expected to be defined in the loop.
3836 The last use is the reduction variable. In case of nested cycle this 4043 The last use is the reduction variable. In case of nested cycle this
3837 assumption is not true: we use reduc_index to record the index of the 4044 assumption is not true: we use reduc_index to record the index of the
3838 reduction variable. */ 4045 reduction variable. */
3839 for (i = 0; i < op_type-1; i++) 4046 for (i = 0; i < op_type-1; i++)
3840 { 4047 {
3841 tree tem;
3842
3843 /* The condition of COND_EXPR is checked in vectorizable_condition(). */ 4048 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
3844 if (i == 0 && code == COND_EXPR) 4049 if (i == 0 && code == COND_EXPR)
3845 continue; 4050 continue;
3846 4051
3847 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, 4052 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
3862 reduc_def_stmt = def_stmt; 4067 reduc_def_stmt = def_stmt;
3863 reduc_index = i; 4068 reduc_index = i;
3864 } 4069 }
3865 } 4070 }
3866 4071
3867 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt, 4072 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
3868 &def, &dt); 4073 &def, &dt, &tem);
4074 if (!vectype_in)
4075 vectype_in = tem;
3869 gcc_assert (is_simple_use); 4076 gcc_assert (is_simple_use);
3870 gcc_assert (dt == vect_reduction_def 4077 gcc_assert (dt == vect_reduction_def
3871 || dt == vect_nested_cycle 4078 || dt == vect_nested_cycle
3872 || ((dt == vect_internal_def || dt == vect_external_def 4079 || ((dt == vect_internal_def || dt == vect_external_def
3873 || dt == vect_constant_def || dt == vect_induction_def) 4080 || dt == vect_constant_def || dt == vect_induction_def)
3920 fprintf (vect_dump, "no optab."); 4127 fprintf (vect_dump, "no optab.");
3921 4128
3922 return false; 4129 return false;
3923 } 4130 }
3924 4131
3925 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing) 4132 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
3926 { 4133 {
3927 if (vect_print_dump_info (REPORT_DETAILS)) 4134 if (vect_print_dump_info (REPORT_DETAILS))
3928 fprintf (vect_dump, "op not supported by target."); 4135 fprintf (vect_dump, "op not supported by target.");
3929 4136
3930 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD 4137 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3962 4169
3963 This means that: 4170 This means that:
3964 1. The tree-code that is used to create the vector operation in the 4171 1. The tree-code that is used to create the vector operation in the
3965 epilog code (that reduces the partial results) is not the 4172 epilog code (that reduces the partial results) is not the
3966 tree-code of STMT, but is rather the tree-code of the original 4173 tree-code of STMT, but is rather the tree-code of the original
3967 stmt from the pattern that STMT is replacing. I.e, in the example 4174 stmt from the pattern that STMT is replacing. I.e, in the example
3968 above we want to use 'widen_sum' in the loop, but 'plus' in the 4175 above we want to use 'widen_sum' in the loop, but 'plus' in the
3969 epilog. 4176 epilog.
3970 2. The type (mode) we use to check available target support 4177 2. The type (mode) we use to check available target support
3971 for the vector operation to be created in the *epilog*, is 4178 for the vector operation to be created in the *epilog*, is
3972 determined by the type of the reduction variable (in the example 4179 determined by the type of the reduction variable (in the example
3973 above we'd check this: plus_optab[vect_int_mode]). 4180 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
3974 However the type (mode) we use to check available target support 4181 However the type (mode) we use to check available target support
3975 for the vector operation to be created *inside the loop*, is 4182 for the vector operation to be created *inside the loop*, is
3976 determined by the type of the other arguments to STMT (in the 4183 determined by the type of the other arguments to STMT (in the
3977 example we'd check this: widen_sum_optab[vect_short_mode]). 4184 example we'd check this: optab_handler (widen_sum_optab,
4185 vect_short_mode)).
3978 4186
3979 This is contrary to "regular" reductions, in which the types of all 4187 This is contrary to "regular" reductions, in which the types of all
3980 the arguments are the same as the type of the reduction variable. 4188 the arguments are the same as the type of the reduction variable.
3981 For "regular" reductions we can therefore use the same vector type 4189 For "regular" reductions we can therefore use the same vector type
3982 (and also the same tree-code) when generating the epilog code and 4190 (and also the same tree-code) when generating the epilog code and
4025 4233
4026 epilog_reduc_code = ERROR_MARK; 4234 epilog_reduc_code = ERROR_MARK;
4027 } 4235 }
4028 4236
4029 if (reduc_optab 4237 if (reduc_optab
4030 && optab_handler (reduc_optab, vec_mode)->insn_code 4238 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4031 == CODE_FOR_nothing)
4032 { 4239 {
4033 if (vect_print_dump_info (REPORT_DETAILS)) 4240 if (vect_print_dump_info (REPORT_DETAILS))
4034 fprintf (vect_dump, "reduc op not supported by target."); 4241 fprintf (vect_dump, "reduc op not supported by target.");
4035 4242
4036 epilog_reduc_code = ERROR_MARK; 4243 epilog_reduc_code = ERROR_MARK;
4159 } 4366 }
4160 4367
4161 /* Handle uses. */ 4368 /* Handle uses. */
4162 if (j == 0) 4369 if (j == 0)
4163 { 4370 {
4371 tree op0, op1 = NULL_TREE;
4372
4373 op0 = ops[!reduc_index];
4374 if (op_type == ternary_op)
4375 {
4376 if (reduc_index == 0)
4377 op1 = ops[2];
4378 else
4379 op1 = ops[1];
4380 }
4381
4164 if (slp_node) 4382 if (slp_node)
4165 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1); 4383 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4384 -1);
4166 else 4385 else
4167 { 4386 {
4168 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index], 4387 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4169 stmt, NULL); 4388 stmt, NULL);
4170 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0); 4389 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4171 if (op_type == ternary_op) 4390 if (op_type == ternary_op)
4172 { 4391 {
4173 if (reduc_index == 0) 4392 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4174 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt, 4393 NULL);
4175 NULL);
4176 else
4177 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
4178 NULL);
4179
4180 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1); 4394 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4181 } 4395 }
4182 } 4396 }
4183 } 4397 }
4184 else 4398 else
4200 reduc_def = gimple_assign_lhs (new_stmt); 4414 reduc_def = gimple_assign_lhs (new_stmt);
4201 4415
4202 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; 4416 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4203 } 4417 }
4204 4418
4205 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++) 4419 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4206 { 4420 {
4207 if (slp_node) 4421 if (slp_node)
4208 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i)); 4422 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4209 else 4423 else
4210 { 4424 {
4365 return true; 4579 return true;
4366 } 4580 }
4367 4581
4368 /* Function vectorizable_live_operation. 4582 /* Function vectorizable_live_operation.
4369 4583
4370 STMT computes a value that is used outside the loop. Check if 4584 STMT computes a value that is used outside the loop. Check if
4371 it can be supported. */ 4585 it can be supported. */
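[Editor's note: a tiny plain-C picture of what "live" means here, a hypothetical loop; per the FORNOW note below, the vectorizer leaves such stmts scalar and uses the last value they compute.]

#include <stdio.h>

int
main (void)
{
  int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int last = 0;
  for (int i = 0; i < 8; i++)
    last = a[i] * 2;     /* live: its final value is used after the loop */
  printf ("%d\n", last); /* prints 16 */
  return 0;
}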
4372 4586
4373 bool 4587 bool
4374 vectorizable_live_operation (gimple stmt, 4588 vectorizable_live_operation (gimple stmt,
4375 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 4589 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4406 op_type = TREE_CODE_LENGTH (code); 4620 op_type = TREE_CODE_LENGTH (code);
4407 rhs_class = get_gimple_rhs_class (code); 4621 rhs_class = get_gimple_rhs_class (code);
4408 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op); 4622 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4409 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op); 4623 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4410 4624
4411 /* FORNOW: support only if all uses are invariant. This means 4625 /* FORNOW: support only if all uses are invariant. This means
4412 that the scalar operations can remain in place, unvectorized. 4626 that the scalar operations can remain in place, unvectorized.
4413 The original last scalar value that they compute will be used. */ 4627 The original last scalar value that they compute will be used. */
4414 4628
4415 for (i = 0; i < op_type; i++) 4629 for (i = 0; i < op_type; i++)
4416 { 4630 {
4517 4731
4518 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a 4732 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4519 compile time constant), or it is a constant that doesn't divide by the 4733 compile time constant), or it is a constant that doesn't divide by the
4520 vectorization factor, then an epilog loop needs to be created. 4734 vectorization factor, then an epilog loop needs to be created.
4521 We therefore duplicate the loop: the original loop will be vectorized, 4735 We therefore duplicate the loop: the original loop will be vectorized,
4522 and will compute the first (n/VF) iterations. The second copy of the loop 4736 and will compute the first (n/VF) iterations. The second copy of the loop
4523 will remain scalar and will compute the remaining (n%VF) iterations. 4737 will remain scalar and will compute the remaining (n%VF) iterations.
4524 (VF is the vectorization factor). */ 4738 (VF is the vectorization factor). */
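[Editor's note: a minimal sketch of that peeling scheme in plain C; VF and the data are assumed, and the real transformation of course rewrites the CFG rather than source loops.]

#include <stdio.h>

#define VF 4   /* assumed vectorization factor */

int
main (void)
{
  int a[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  int n = 10, sum = 0, i;
  int bound = (n / VF) * VF;   /* iterations the vector loop covers */

  for (i = 0; i < bound; i++)  /* copy to be vectorized: n/VF iterations */
    sum += a[i];
  for (; i < n; i++)           /* scalar epilog: the remaining n%VF */
    sum += a[i];

  printf ("%d\n", sum);        /* 45 */
  return 0;
}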
4525 4739
4526 if (do_peeling_for_loop_bound) 4740 if (do_peeling_for_loop_bound)
4527 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio, 4741 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,