Mercurial > hg > CbC > CbC_gcc
comparison gcc/tree-vect-loop.c @ 67:f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
author | nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 22 Mar 2011 17:18:12 +0900 |
parents | b7f97abdc517 |
children | 04ced10e8804 |
comparison
equal
deleted
inserted
replaced
65:65488c3d617d | 67:f6334be47118 |
---|---|
25 #include "coretypes.h" | 25 #include "coretypes.h" |
26 #include "tm.h" | 26 #include "tm.h" |
27 #include "ggc.h" | 27 #include "ggc.h" |
28 #include "tree.h" | 28 #include "tree.h" |
29 #include "basic-block.h" | 29 #include "basic-block.h" |
30 #include "diagnostic.h" | |
31 #include "tree-pretty-print.h" | 30 #include "tree-pretty-print.h" |
32 #include "gimple-pretty-print.h" | 31 #include "gimple-pretty-print.h" |
33 #include "tree-flow.h" | 32 #include "tree-flow.h" |
34 #include "tree-dump.h" | 33 #include "tree-dump.h" |
35 #include "cfgloop.h" | 34 #include "cfgloop.h" |
36 #include "cfglayout.h" | 35 #include "cfglayout.h" |
37 #include "expr.h" | 36 #include "expr.h" |
38 #include "recog.h" | 37 #include "recog.h" |
39 #include "optabs.h" | 38 #include "optabs.h" |
40 #include "params.h" | 39 #include "params.h" |
41 #include "toplev.h" | 40 #include "diagnostic-core.h" |
42 #include "tree-chrec.h" | 41 #include "tree-chrec.h" |
43 #include "tree-scalar-evolution.h" | 42 #include "tree-scalar-evolution.h" |
44 #include "tree-vectorizer.h" | 43 #include "tree-vectorizer.h" |
44 #include "target.h" | |
45 | 45 |
46 /* Loop Vectorization Pass. | 46 /* Loop Vectorization Pass. |
47 | 47 |
48 This pass tries to vectorize loops. | 48 This pass tries to vectorize loops. |
49 | 49 |
73 the vectorizer applies a set of analyses on a given set of loops, | 73 the vectorizer applies a set of analyses on a given set of loops, |
74 followed by the actual vectorization transformation for the loops that | 74 followed by the actual vectorization transformation for the loops that |
75 had successfully passed the analysis phase. | 75 had successfully passed the analysis phase. |
76 Throughout this pass we make a distinction between two types of | 76 Throughout this pass we make a distinction between two types of |
77 data: scalars (which are represented by SSA_NAMES), and memory references | 77 data: scalars (which are represented by SSA_NAMES), and memory references |
78 ("data-refs"). These two types of data require different handling both | 78 ("data-refs"). These two types of data require different handling both |
79 during analysis and transformation. The types of data-refs that the | 79 during analysis and transformation. The types of data-refs that the |
80 vectorizer currently supports are ARRAY_REFS which base is an array DECL | 80 vectorizer currently supports are ARRAY_REFS which base is an array DECL |
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer | 81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer |
82 accesses are required to have a simple (consecutive) access pattern. | 82 accesses are required to have a simple (consecutive) access pattern. |
83 | 83 |
94 | 94 |
95 Transformation phase: | 95 Transformation phase: |
96 ===================== | 96 ===================== |
97 The loop transformation phase scans all the stmts in the loop, and | 97 The loop transformation phase scans all the stmts in the loop, and |
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in | 98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in |
99 the loop that needs to be vectorized. It inserts the vector code sequence | 99 the loop that needs to be vectorized. It inserts the vector code sequence |
100 just before the scalar stmt S, and records a pointer to the vector code | 100 just before the scalar stmt S, and records a pointer to the vector code |
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct | 101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct |
102 attached to S). This pointer will be used for the vectorization of following | 102 attached to S). This pointer will be used for the vectorization of following |
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory; | 103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory; |
104 otherwise, we rely on dead code elimination for removing it. | 104 otherwise, we rely on dead code elimination for removing it. |
105 | 105 |
106 For example, say stmt S1 was vectorized into stmt VS1: | 106 For example, say stmt S1 was vectorized into stmt VS1: |
107 | 107 |
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 | 109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 |
110 S2: a = b; | 110 S2: a = b; |
111 | 111 |
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines | 112 To vectorize stmt S2, the vectorizer first finds the stmt that defines |
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the | 113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the |
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The | 114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The |
115 resulting sequence would be: | 115 resulting sequence would be: |
116 | 116 |
117 VS1: vb = px[i]; | 117 VS1: vb = px[i]; |
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 | 118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 |
119 VS2: va = vb; | 119 VS2: va = vb; |
123 load/store operations (like 'x[i]' in S1), and are handled differently. | 123 load/store operations (like 'x[i]' in S1), and are handled differently. |
124 | 124 |
125 Target modeling: | 125 Target modeling: |
126 ================= | 126 ================= |
127 Currently the only target specific information that is used is the | 127 Currently the only target specific information that is used is the |
128 size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can | 128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". |
129 support different sizes of vectors, for now will need to specify one value | 129 Targets that can support different sizes of vectors, for now will need |
130 for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future. | 130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More |
131 flexibility will be added in the future. | |
131 | 132 |
132 Since we only vectorize operations which vector form can be | 133 Since we only vectorize operations which vector form can be |
133 expressed using existing tree codes, to verify that an operation is | 134 expressed using existing tree codes, to verify that an operation is |
134 supported, the vectorizer checks the relevant optab at the relevant | 135 supported, the vectorizer checks the relevant optab at the relevant |
135 machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If | 136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If |
136 the value found is CODE_FOR_nothing, then there's no target support, and | 137 the value found is CODE_FOR_nothing, then there's no target support, and |
137 we can't vectorize the stmt. | 138 we can't vectorize the stmt. |
138 | 139 |
139 For additional information on this project see: | 140 For additional information on this project see: |
140 http://gcc.gnu.org/projects/tree-ssa/vectorization.html | 141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html |
141 */ | 142 */ |
142 | 143 |
143 /* Function vect_determine_vectorization_factor | 144 /* Function vect_determine_vectorization_factor |
144 | 145 |
145 Determine the vectorization factor (VF). VF is the number of data elements | 146 Determine the vectorization factor (VF). VF is the number of data elements |
146 that are operated upon in parallel in a single iteration of the vectorized | 147 that are operated upon in parallel in a single iteration of the vectorized |
147 loop. For example, when vectorizing a loop that operates on 4byte elements, | 148 loop. For example, when vectorizing a loop that operates on 4byte elements, |
148 on a target with vector size (VS) 16byte, the VF is set to 4, since 4 | 149 on a target with vector size (VS) 16byte, the VF is set to 4, since 4 |
149 elements can fit in a single vector register. | 150 elements can fit in a single vector register. |
150 | 151 |
151 We currently support vectorization of loops in which all types operated upon | 152 We currently support vectorization of loops in which all types operated upon |
152 are of the same size. Therefore this function currently sets VF according to | 153 are of the same size. Therefore this function currently sets VF according to |
153 the size of the types operated upon, and fails if there are multiple sizes | 154 the size of the types operated upon, and fails if there are multiple sizes |
154 in the loop. | 155 in the loop. |
155 | 156 |
156 VF is also the factor by which the loop iterations are strip-mined, e.g.: | 157 VF is also the factor by which the loop iterations are strip-mined, e.g.: |
157 original loop: | 158 original loop: |
434 } | 435 } |
435 | 436 |
436 /* Function vect_analyze_scalar_cycles_1. | 437 /* Function vect_analyze_scalar_cycles_1. |
437 | 438 |
438 Examine the cross iteration def-use cycles of scalar variables | 439 Examine the cross iteration def-use cycles of scalar variables |
439 in LOOP. LOOP_VINFO represents the loop that is now being | 440 in LOOP. LOOP_VINFO represents the loop that is now being |
440 considered for vectorization (can be LOOP, or an outer-loop | 441 considered for vectorization (can be LOOP, or an outer-loop |
441 enclosing LOOP). */ | 442 enclosing LOOP). */ |
442 | 443 |
443 static void | 444 static void |
444 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) | 445 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) |
450 bool double_reduc; | 451 bool double_reduc; |
451 | 452 |
452 if (vect_print_dump_info (REPORT_DETAILS)) | 453 if (vect_print_dump_info (REPORT_DETAILS)) |
453 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ==="); | 454 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ==="); |
454 | 455 |
455 /* First - identify all inductions. Reduction detection assumes that all the | 456 /* First - identify all inductions. Reduction detection assumes that all the |
456 inductions have been identified, therefore, this order must not be | 457 inductions have been identified, therefore, this order must not be |
457 changed. */ | 458 changed. */ |
458 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) | 459 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) |
459 { | 460 { |
460 gimple phi = gsi_stmt (gsi); | 461 gimple phi = gsi_stmt (gsi); |
466 { | 467 { |
467 fprintf (vect_dump, "Analyze phi: "); | 468 fprintf (vect_dump, "Analyze phi: "); |
468 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); | 469 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); |
469 } | 470 } |
470 | 471 |
471 /* Skip virtual phi's. The data dependences that are associated with | 472 /* Skip virtual phi's. The data dependences that are associated with |
472 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */ | 473 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */ |
473 if (!is_gimple_reg (SSA_NAME_VAR (def))) | 474 if (!is_gimple_reg (SSA_NAME_VAR (def))) |
474 continue; | 475 continue; |
475 | 476 |
476 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type; | 477 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type; |
477 | 478 |
478 /* Analyze the evolution function. */ | 479 /* Analyze the evolution function. */ |
479 access_fn = analyze_scalar_evolution (loop, def); | 480 access_fn = analyze_scalar_evolution (loop, def); |
481 if (access_fn) | |
482 STRIP_NOPS (access_fn); | |
480 if (access_fn && vect_print_dump_info (REPORT_DETAILS)) | 483 if (access_fn && vect_print_dump_info (REPORT_DETAILS)) |
481 { | 484 { |
482 fprintf (vect_dump, "Access function of PHI: "); | 485 fprintf (vect_dump, "Access function of PHI: "); |
483 print_generic_expr (vect_dump, access_fn, TDF_SLIM); | 486 print_generic_expr (vect_dump, access_fn, TDF_SLIM); |
484 } | 487 } |
565 | 568 |
566 | 569 |
567 /* Function vect_analyze_scalar_cycles. | 570 /* Function vect_analyze_scalar_cycles. |
568 | 571 |
569 Examine the cross iteration def-use cycles of scalar variables, by | 572 Examine the cross iteration def-use cycles of scalar variables, by |
570 analyzing the loop-header PHIs of scalar variables; Classify each | 573 analyzing the loop-header PHIs of scalar variables. Classify each |
571 cycle as one of the following: invariant, induction, reduction, unknown. | 574 cycle as one of the following: invariant, induction, reduction, unknown. |
572 We do that for the loop represented by LOOP_VINFO, and also to its | 575 We do that for the loop represented by LOOP_VINFO, and also to its |
573 inner-loop, if exists. | 576 inner-loop, if exists. |
574 Examples for scalar cycles: | 577 Examples for scalar cycles: |
575 | 578 |
740 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL; | 743 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL; |
741 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0; | 744 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0; |
742 LOOP_VINFO_VECTORIZABLE_P (res) = 0; | 745 LOOP_VINFO_VECTORIZABLE_P (res) = 0; |
743 LOOP_PEELING_FOR_ALIGNMENT (res) = 0; | 746 LOOP_PEELING_FOR_ALIGNMENT (res) = 0; |
744 LOOP_VINFO_VECT_FACTOR (res) = 0; | 747 LOOP_VINFO_VECT_FACTOR (res) = 0; |
748 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3); | |
745 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10); | 749 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10); |
746 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10); | 750 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10); |
747 LOOP_VINFO_UNALIGNED_DR (res) = NULL; | 751 LOOP_VINFO_UNALIGNED_DR (res) = NULL; |
748 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = | 752 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = |
749 VEC_alloc (gimple, heap, | 753 VEC_alloc (gimple, heap, |
753 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); | 757 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); |
754 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10); | 758 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10); |
755 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10); | 759 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10); |
756 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10); | 760 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10); |
757 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1; | 761 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1; |
762 LOOP_VINFO_PEELING_HTAB (res) = NULL; | |
758 | 763 |
759 return res; | 764 return res; |
760 } | 765 } |
761 | 766 |
762 | 767 |
787 if (!clean_stmts) | 792 if (!clean_stmts) |
788 { | 793 { |
789 free (LOOP_VINFO_BBS (loop_vinfo)); | 794 free (LOOP_VINFO_BBS (loop_vinfo)); |
790 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo)); | 795 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo)); |
791 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); | 796 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); |
797 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo)); | |
792 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); | 798 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); |
799 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)); | |
793 | 800 |
794 free (loop_vinfo); | 801 free (loop_vinfo); |
795 loop->aux = NULL; | 802 loop->aux = NULL; |
796 return; | 803 return; |
797 } | 804 } |
833 } | 840 } |
834 | 841 |
835 free (LOOP_VINFO_BBS (loop_vinfo)); | 842 free (LOOP_VINFO_BBS (loop_vinfo)); |
836 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo)); | 843 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo)); |
837 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); | 844 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); |
845 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo)); | |
838 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); | 846 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); |
839 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)); | 847 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)); |
840 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); | 848 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); |
841 for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++) | 849 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance) |
842 vect_free_slp_instance (instance); | 850 vect_free_slp_instance (instance); |
843 | 851 |
844 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); | 852 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); |
845 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo)); | 853 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo)); |
846 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo)); | 854 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo)); |
855 | |
856 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo)) | |
857 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo)); | |
847 | 858 |
848 free (loop_vinfo); | 859 free (loop_vinfo); |
849 loop->aux = NULL; | 860 loop->aux = NULL; |
850 } | 861 } |
851 | 862 |
1115 loop->aux = loop_vinfo; | 1126 loop->aux = loop_vinfo; |
1116 return loop_vinfo; | 1127 return loop_vinfo; |
1117 } | 1128 } |
1118 | 1129 |
1119 | 1130 |
1131 /* Get cost by calling cost target builtin. */ | |
1132 | |
1133 static inline int | |
1134 vect_get_cost (enum vect_cost_for_stmt type_of_cost) | |
1135 { | |
1136 tree dummy_type = NULL; | |
1137 int dummy = 0; | |
1138 | |
1139 return targetm.vectorize.builtin_vectorization_cost (type_of_cost, | |
1140 dummy_type, dummy); | |
1141 } | |
1142 | |
1143 | |
1120 /* Function vect_analyze_loop_operations. | 1144 /* Function vect_analyze_loop_operations. |
1121 | 1145 |
1122 Scan the loop stmts and make sure they are all vectorizable. */ | 1146 Scan the loop stmts and make sure they are all vectorizable. */ |
1123 | 1147 |
1124 static bool | 1148 static bool |
1280 fprintf (vect_dump,"not vectorized: iteration count smaller than " | 1304 fprintf (vect_dump,"not vectorized: iteration count smaller than " |
1281 "vectorization factor."); | 1305 "vectorization factor."); |
1282 return false; | 1306 return false; |
1283 } | 1307 } |
1284 | 1308 |
1285 /* Analyze cost. Decide if worth while to vectorize. */ | 1309 /* Analyze cost. Decide if worth while to vectorize. */ |
1286 | 1310 |
1287 /* Once VF is set, SLP costs should be updated since the number of created | 1311 /* Once VF is set, SLP costs should be updated since the number of created |
1288 vector stmts depends on VF. */ | 1312 vector stmts depends on VF. */ |
1289 vect_update_slp_costs_according_to_vf (loop_vinfo); | 1313 vect_update_slp_costs_according_to_vf (loop_vinfo); |
1290 | 1314 |
1350 | 1374 |
1351 return true; | 1375 return true; |
1352 } | 1376 } |
1353 | 1377 |
1354 | 1378 |
1355 /* Function vect_analyze_loop. | 1379 /* Function vect_analyze_loop_2. |
1356 | 1380 |
1357 Apply a set of analyses on LOOP, and create a loop_vec_info struct | 1381 Apply a set of analyses on LOOP, and create a loop_vec_info struct |
1358 for it. The different analyses will record information in the | 1382 for it. The different analyses will record information in the |
1359 loop_vec_info struct. */ | 1383 loop_vec_info struct. */ |
1360 loop_vec_info | 1384 static bool |
1361 vect_analyze_loop (struct loop *loop) | 1385 vect_analyze_loop_2 (loop_vec_info loop_vinfo) |
1362 { | 1386 { |
1363 bool ok; | 1387 bool ok, dummy; |
1364 loop_vec_info loop_vinfo; | |
1365 int max_vf = MAX_VECTORIZATION_FACTOR; | 1388 int max_vf = MAX_VECTORIZATION_FACTOR; |
1366 int min_vf = 2; | 1389 int min_vf = 2; |
1367 | |
1368 if (vect_print_dump_info (REPORT_DETAILS)) | |
1369 fprintf (vect_dump, "===== analyze_loop_nest ====="); | |
1370 | |
1371 if (loop_outer (loop) | |
1372 && loop_vec_info_for_loop (loop_outer (loop)) | |
1373 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop)))) | |
1374 { | |
1375 if (vect_print_dump_info (REPORT_DETAILS)) | |
1376 fprintf (vect_dump, "outer-loop already vectorized."); | |
1377 return NULL; | |
1378 } | |
1379 | |
1380 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */ | |
1381 | |
1382 loop_vinfo = vect_analyze_loop_form (loop); | |
1383 if (!loop_vinfo) | |
1384 { | |
1385 if (vect_print_dump_info (REPORT_DETAILS)) | |
1386 fprintf (vect_dump, "bad loop form."); | |
1387 return NULL; | |
1388 } | |
1389 | 1390 |
1390 /* Find all data references in the loop (which correspond to vdefs/vuses) | 1391 /* Find all data references in the loop (which correspond to vdefs/vuses) |
1391 and analyze their evolution in the loop. Also adjust the minimal | 1392 and analyze their evolution in the loop. Also adjust the minimal |
1392 vectorization factor according to the loads and stores. | 1393 vectorization factor according to the loads and stores. |
1393 | 1394 |
1397 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf); | 1398 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf); |
1398 if (!ok) | 1399 if (!ok) |
1399 { | 1400 { |
1400 if (vect_print_dump_info (REPORT_DETAILS)) | 1401 if (vect_print_dump_info (REPORT_DETAILS)) |
1401 fprintf (vect_dump, "bad data references."); | 1402 fprintf (vect_dump, "bad data references."); |
1402 destroy_loop_vec_info (loop_vinfo, true); | 1403 return false; |
1403 return NULL; | |
1404 } | 1404 } |
1405 | 1405 |
1406 /* Classify all cross-iteration scalar data-flow cycles. | 1406 /* Classify all cross-iteration scalar data-flow cycles. |
1407 Cross-iteration cycles caused by virtual phis are analyzed separately. */ | 1407 Cross-iteration cycles caused by virtual phis are analyzed separately. */ |
1408 | 1408 |
1415 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); | 1415 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); |
1416 if (!ok) | 1416 if (!ok) |
1417 { | 1417 { |
1418 if (vect_print_dump_info (REPORT_DETAILS)) | 1418 if (vect_print_dump_info (REPORT_DETAILS)) |
1419 fprintf (vect_dump, "unexpected pattern."); | 1419 fprintf (vect_dump, "unexpected pattern."); |
1420 destroy_loop_vec_info (loop_vinfo, true); | 1420 return false; |
1421 return NULL; | |
1422 } | 1421 } |
1423 | 1422 |
1424 /* Analyze data dependences between the data-refs in the loop | 1423 /* Analyze data dependences between the data-refs in the loop |
1425 and adjust the maximum vectorization factor according to | 1424 and adjust the maximum vectorization factor according to |
1426 the dependences. | 1425 the dependences. |
1427 FORNOW: fail at the first data dependence that we encounter. */ | 1426 FORNOW: fail at the first data dependence that we encounter. */ |
1428 | 1427 |
1429 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf); | 1428 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy); |
1430 if (!ok | 1429 if (!ok |
1431 || max_vf < min_vf) | 1430 || max_vf < min_vf) |
1432 { | 1431 { |
1433 if (vect_print_dump_info (REPORT_DETAILS)) | 1432 if (vect_print_dump_info (REPORT_DETAILS)) |
1434 fprintf (vect_dump, "bad data dependence."); | 1433 fprintf (vect_dump, "bad data dependence."); |
1435 destroy_loop_vec_info (loop_vinfo, true); | 1434 return false; |
1436 return NULL; | |
1437 } | 1435 } |
1438 | 1436 |
1439 ok = vect_determine_vectorization_factor (loop_vinfo); | 1437 ok = vect_determine_vectorization_factor (loop_vinfo); |
1440 if (!ok) | 1438 if (!ok) |
1441 { | 1439 { |
1442 if (vect_print_dump_info (REPORT_DETAILS)) | 1440 if (vect_print_dump_info (REPORT_DETAILS)) |
1443 fprintf (vect_dump, "can't determine vectorization factor."); | 1441 fprintf (vect_dump, "can't determine vectorization factor."); |
1444 destroy_loop_vec_info (loop_vinfo, true); | 1442 return false; |
1445 return NULL; | |
1446 } | 1443 } |
1447 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo)) | 1444 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo)) |
1448 { | 1445 { |
1449 if (vect_print_dump_info (REPORT_DETAILS)) | 1446 if (vect_print_dump_info (REPORT_DETAILS)) |
1450 fprintf (vect_dump, "bad data dependence."); | 1447 fprintf (vect_dump, "bad data dependence."); |
1451 destroy_loop_vec_info (loop_vinfo, true); | 1448 return false; |
1452 return NULL; | |
1453 } | 1449 } |
1454 | 1450 |
1455 /* Analyze the alignment of the data-refs in the loop. | 1451 /* Analyze the alignment of the data-refs in the loop. |
1456 Fail if a data reference is found that cannot be vectorized. */ | 1452 Fail if a data reference is found that cannot be vectorized. */ |
1457 | 1453 |
1458 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL); | 1454 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL); |
1459 if (!ok) | 1455 if (!ok) |
1460 { | 1456 { |
1461 if (vect_print_dump_info (REPORT_DETAILS)) | 1457 if (vect_print_dump_info (REPORT_DETAILS)) |
1462 fprintf (vect_dump, "bad data alignment."); | 1458 fprintf (vect_dump, "bad data alignment."); |
1463 destroy_loop_vec_info (loop_vinfo, true); | 1459 return false; |
1464 return NULL; | |
1465 } | 1460 } |
1466 | 1461 |
1467 /* Analyze the access patterns of the data-refs in the loop (consecutive, | 1462 /* Analyze the access patterns of the data-refs in the loop (consecutive, |
1468 complex, etc.). FORNOW: Only handle consecutive access pattern. */ | 1463 complex, etc.). FORNOW: Only handle consecutive access pattern. */ |
1469 | 1464 |
1470 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL); | 1465 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL); |
1471 if (!ok) | 1466 if (!ok) |
1472 { | 1467 { |
1473 if (vect_print_dump_info (REPORT_DETAILS)) | 1468 if (vect_print_dump_info (REPORT_DETAILS)) |
1474 fprintf (vect_dump, "bad data access."); | 1469 fprintf (vect_dump, "bad data access."); |
1475 destroy_loop_vec_info (loop_vinfo, true); | 1470 return false; |
1476 return NULL; | |
1477 } | 1471 } |
1478 | 1472 |
1479 /* Prune the list of ddrs to be tested at run-time by versioning for alias. | 1473 /* Prune the list of ddrs to be tested at run-time by versioning for alias. |
1480 It is important to call pruning after vect_analyze_data_ref_accesses, | 1474 It is important to call pruning after vect_analyze_data_ref_accesses, |
1481 since we use grouping information gathered by interleaving analysis. */ | 1475 since we use grouping information gathered by interleaving analysis. */ |
1483 if (!ok) | 1477 if (!ok) |
1484 { | 1478 { |
1485 if (vect_print_dump_info (REPORT_DETAILS)) | 1479 if (vect_print_dump_info (REPORT_DETAILS)) |
1486 fprintf (vect_dump, "too long list of versioning for alias " | 1480 fprintf (vect_dump, "too long list of versioning for alias " |
1487 "run-time tests."); | 1481 "run-time tests."); |
1488 destroy_loop_vec_info (loop_vinfo, true); | 1482 return false; |
1489 return NULL; | 1483 } |
1484 | |
1485 /* This pass will decide on using loop versioning and/or loop peeling in | |
1486 order to enhance the alignment of data references in the loop. */ | |
1487 | |
1488 ok = vect_enhance_data_refs_alignment (loop_vinfo); | |
1489 if (!ok) | |
1490 { | |
1491 if (vect_print_dump_info (REPORT_DETAILS)) | |
1492 fprintf (vect_dump, "bad data alignment."); | |
1493 return false; | |
1490 } | 1494 } |
1491 | 1495 |
1492 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ | 1496 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ |
1493 ok = vect_analyze_slp (loop_vinfo, NULL); | 1497 ok = vect_analyze_slp (loop_vinfo, NULL); |
1494 if (ok) | 1498 if (ok) |
1498 | 1502 |
1499 /* Find stmts that need to be both vectorized and SLPed. */ | 1503 /* Find stmts that need to be both vectorized and SLPed. */ |
1500 vect_detect_hybrid_slp (loop_vinfo); | 1504 vect_detect_hybrid_slp (loop_vinfo); |
1501 } | 1505 } |
1502 | 1506 |
1503 /* This pass will decide on using loop versioning and/or loop peeling in | |
1504 order to enhance the alignment of data references in the loop. */ | |
1505 | |
1506 ok = vect_enhance_data_refs_alignment (loop_vinfo); | |
1507 if (!ok) | |
1508 { | |
1509 if (vect_print_dump_info (REPORT_DETAILS)) | |
1510 fprintf (vect_dump, "bad data alignment."); | |
1511 destroy_loop_vec_info (loop_vinfo, true); | |
1512 return NULL; | |
1513 } | |
1514 | |
1515 /* Scan all the operations in the loop and make sure they are | 1507 /* Scan all the operations in the loop and make sure they are |
1516 vectorizable. */ | 1508 vectorizable. */ |
1517 | 1509 |
1518 ok = vect_analyze_loop_operations (loop_vinfo); | 1510 ok = vect_analyze_loop_operations (loop_vinfo); |
1519 if (!ok) | 1511 if (!ok) |
1520 { | 1512 { |
1521 if (vect_print_dump_info (REPORT_DETAILS)) | 1513 if (vect_print_dump_info (REPORT_DETAILS)) |
1522 fprintf (vect_dump, "bad operation or unsupported loop bound."); | 1514 fprintf (vect_dump, "bad operation or unsupported loop bound."); |
1515 return false; | |
1516 } | |
1517 | |
1518 return true; | |
1519 } | |
1520 | |
1521 /* Function vect_analyze_loop. | |
1522 | |
1523 Apply a set of analyses on LOOP, and create a loop_vec_info struct | |
1524 for it. The different analyses will record information in the | |
1525 loop_vec_info struct. */ | |
1526 loop_vec_info | |
1527 vect_analyze_loop (struct loop *loop) | |
1528 { | |
1529 loop_vec_info loop_vinfo; | |
1530 unsigned int vector_sizes; | |
1531 | |
1532 /* Autodetect first vector size we try. */ | |
1533 current_vector_size = 0; | |
1534 vector_sizes = targetm.vectorize.autovectorize_vector_sizes (); | |
1535 | |
1536 if (vect_print_dump_info (REPORT_DETAILS)) | |
1537 fprintf (vect_dump, "===== analyze_loop_nest ====="); | |
1538 | |
1539 if (loop_outer (loop) | |
1540 && loop_vec_info_for_loop (loop_outer (loop)) | |
1541 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop)))) | |
1542 { | |
1543 if (vect_print_dump_info (REPORT_DETAILS)) | |
1544 fprintf (vect_dump, "outer-loop already vectorized."); | |
1545 return NULL; | |
1546 } | |
1547 | |
1548 while (1) | |
1549 { | |
1550 /* Check the CFG characteristics of the loop (nesting, entry/exit). */ | |
1551 loop_vinfo = vect_analyze_loop_form (loop); | |
1552 if (!loop_vinfo) | |
1553 { | |
1554 if (vect_print_dump_info (REPORT_DETAILS)) | |
1555 fprintf (vect_dump, "bad loop form."); | |
1556 return NULL; | |
1557 } | |
1558 | |
1559 if (vect_analyze_loop_2 (loop_vinfo)) | |
1560 { | |
1561 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; | |
1562 | |
1563 return loop_vinfo; | |
1564 } | |
1565 | |
1523 destroy_loop_vec_info (loop_vinfo, true); | 1566 destroy_loop_vec_info (loop_vinfo, true); |
1524 return NULL; | 1567 |
1525 } | 1568 vector_sizes &= ~current_vector_size; |
1526 | 1569 if (vector_sizes == 0 |
1527 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; | 1570 || current_vector_size == 0) |
1528 | 1571 return NULL; |
1529 return loop_vinfo; | 1572 |
1573 /* Try the next biggest vector size. */ | |
1574 current_vector_size = 1 << floor_log2 (vector_sizes); | |
1575 if (vect_print_dump_info (REPORT_DETAILS)) | |
1576 fprintf (vect_dump, "***** Re-trying analysis with " | |
1577 "vector size %d\n", current_vector_size); | |
1578 } | |
1530 } | 1579 } |
1531 | 1580 |
1532 | 1581 |
1533 /* Function reduction_code_for_scalar_code | 1582 /* Function reduction_code_for_scalar_code |
1534 | 1583 |
1573 return false; | 1622 return false; |
1574 } | 1623 } |
1575 } | 1624 } |
1576 | 1625 |
1577 | 1626 |
1578 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement | 1627 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement |
1579 STMT is printed with a message MSG. */ | 1628 STMT is printed with a message MSG. */ |
1580 | 1629 |
1581 static void | 1630 static void |
1582 report_vect_op (gimple stmt, const char *msg) | 1631 report_vect_op (gimple stmt, const char *msg) |
1583 { | 1632 { |
1587 | 1636 |
1588 | 1637 |
1589 /* Function vect_is_simple_reduction_1 | 1638 /* Function vect_is_simple_reduction_1 |
1590 | 1639 |
1591 (1) Detect a cross-iteration def-use cycle that represents a simple | 1640 (1) Detect a cross-iteration def-use cycle that represents a simple |
1592 reduction computation. We look for the following pattern: | 1641 reduction computation. We look for the following pattern: |
1593 | 1642 |
1594 loop_header: | 1643 loop_header: |
1595 a1 = phi < a0, a2 > | 1644 a1 = phi < a0, a2 > |
1596 a3 = ... | 1645 a3 = ... |
1597 a2 = operation (a3, a1) | 1646 a2 = operation (a3, a1) |
1598 | 1647 |
1599 such that: | 1648 such that: |
1600 1. operation is commutative and associative and it is safe to | 1649 1. operation is commutative and associative and it is safe to |
1601 change the order of the computation (if CHECK_REDUCTION is true) | 1650 change the order of the computation (if CHECK_REDUCTION is true) |
1602 2. no uses for a2 in the loop (a2 is used out of the loop) | 1651 2. no uses for a2 in the loop (a2 is used out of the loop) |
1603 3. no uses of a1 in the loop besides the reduction operation. | 1652 3. no uses of a1 in the loop besides the reduction operation |
1604 | 1653 4. no uses of a1 outside the loop. |
1605 Condition 1 is tested here. | 1654 |
1655 Conditions 1,4 are tested here. | |
1606 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. | 1656 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. |
1607 | 1657 |
1608 (2) Detect a cross-iteration def-use cycle in nested loops, i.e., | 1658 (2) Detect a cross-iteration def-use cycle in nested loops, i.e., |
1609 nested cycles, if CHECK_REDUCTION is false. | 1659 nested cycles, if CHECK_REDUCTION is false. |
1610 | 1660 |
1651 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) | 1701 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) |
1652 { | 1702 { |
1653 gimple use_stmt = USE_STMT (use_p); | 1703 gimple use_stmt = USE_STMT (use_p); |
1654 if (is_gimple_debug (use_stmt)) | 1704 if (is_gimple_debug (use_stmt)) |
1655 continue; | 1705 continue; |
1656 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) | 1706 |
1657 && vinfo_for_stmt (use_stmt) | 1707 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) |
1708 { | |
1709 if (vect_print_dump_info (REPORT_DETAILS)) | |
1710 fprintf (vect_dump, "intermediate value used outside loop."); | |
1711 | |
1712 return NULL; | |
1713 } | |
1714 | |
1715 if (vinfo_for_stmt (use_stmt) | |
1658 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt))) | 1716 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt))) |
1659 nloop_uses++; | 1717 nloop_uses++; |
1660 if (nloop_uses > 1) | 1718 if (nloop_uses > 1) |
1661 { | 1719 { |
1662 if (vect_print_dump_info (REPORT_DETAILS)) | 1720 if (vect_print_dump_info (REPORT_DETAILS)) |
1754 | 1812 |
1755 /* We can handle "res -= x[i]", which is non-associative by | 1813 /* We can handle "res -= x[i]", which is non-associative by |
1756 simply rewriting this into "res += -x[i]". Avoid changing | 1814 simply rewriting this into "res += -x[i]". Avoid changing |
1757 gimple instruction for the first simple tests and only do this | 1815 gimple instruction for the first simple tests and only do this |
1758 if we're allowed to change code at all. */ | 1816 if we're allowed to change code at all. */ |
1759 if (code == MINUS_EXPR && modify) | 1817 if (code == MINUS_EXPR |
1818 && modify | |
1819 && (op1 = gimple_assign_rhs1 (def_stmt)) | |
1820 && TREE_CODE (op1) == SSA_NAME | |
1821 && SSA_NAME_DEF_STMT (op1) == phi) | |
1760 code = PLUS_EXPR; | 1822 code = PLUS_EXPR; |
1761 | 1823 |
1762 if (check_reduction | 1824 if (check_reduction |
1763 && (!commutative_tree_code (code) || !associative_tree_code (code))) | 1825 && (!commutative_tree_code (code) || !associative_tree_code (code))) |
1764 { | 1826 { |
1919 | 1981 |
1920 if (def2 && def2 == phi | 1982 if (def2 && def2 == phi |
1921 && (code == COND_EXPR | 1983 && (code == COND_EXPR |
1922 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1)) | 1984 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1)) |
1923 && (is_gimple_assign (def1) | 1985 && (is_gimple_assign (def1) |
1986 || is_gimple_call (def1) | |
1924 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) | 1987 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) |
1925 == vect_induction_def | 1988 == vect_induction_def |
1926 || (gimple_code (def1) == GIMPLE_PHI | 1989 || (gimple_code (def1) == GIMPLE_PHI |
1927 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) | 1990 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) |
1928 == vect_internal_def | 1991 == vect_internal_def |
1934 } | 1997 } |
1935 else if (def1 && def1 == phi | 1998 else if (def1 && def1 == phi |
1936 && (code == COND_EXPR | 1999 && (code == COND_EXPR |
1937 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2)) | 2000 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2)) |
1938 && (is_gimple_assign (def2) | 2001 && (is_gimple_assign (def2) |
2002 || is_gimple_call (def2) | |
1939 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) | 2003 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) |
1940 == vect_induction_def | 2004 == vect_induction_def |
1941 || (gimple_code (def2) == GIMPLE_PHI | 2005 || (gimple_code (def2) == GIMPLE_PHI |
1942 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) | 2006 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) |
1943 == vect_internal_def | 2007 == vect_internal_def |
1991 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi, | 2055 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi, |
1992 bool check_reduction, bool *double_reduc) | 2056 bool check_reduction, bool *double_reduc) |
1993 { | 2057 { |
1994 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction, | 2058 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction, |
1995 double_reduc, true); | 2059 double_reduc, true); |
2060 } | |
2061 | |
2062 /* Calculate the cost of one scalar iteration of the loop. */ | |
2063 int | |
2064 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo) | |
2065 { | |
2066 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
2067 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | |
2068 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0; | |
2069 int innerloop_iters, i, stmt_cost; | |
2070 | |
2071 /* Count statements in scalar loop. Using this as scalar cost for a single | |
2072 iteration for now. | |
2073 | |
2074 TODO: Add outer loop support. | |
2075 | |
2076 TODO: Consider assigning different costs to different scalar | |
2077 statements. */ | |
2078 | |
2079 /* FORNOW. */ | |
2080 innerloop_iters = 1; | |
2081 if (loop->inner) | |
2082 innerloop_iters = 50; /* FIXME */ | |
2083 | |
2084 for (i = 0; i < nbbs; i++) | |
2085 { | |
2086 gimple_stmt_iterator si; | |
2087 basic_block bb = bbs[i]; | |
2088 | |
2089 if (bb->loop_father == loop->inner) | |
2090 factor = innerloop_iters; | |
2091 else | |
2092 factor = 1; | |
2093 | |
2094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) | |
2095 { | |
2096 gimple stmt = gsi_stmt (si); | |
2097 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
2098 | |
2099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt)) | |
2100 continue; | |
2101 | |
2102 /* Skip stmts that are not vectorized inside the loop. */ | |
2103 if (stmt_info | |
2104 && !STMT_VINFO_RELEVANT_P (stmt_info) | |
2105 && (!STMT_VINFO_LIVE_P (stmt_info) | |
2106 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) | |
2107 continue; | |
2108 | |
2109 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) | |
2110 { | |
2111 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))) | |
2112 stmt_cost = vect_get_cost (scalar_load); | |
2113 else | |
2114 stmt_cost = vect_get_cost (scalar_store); | |
2115 } | |
2116 else | |
2117 stmt_cost = vect_get_cost (scalar_stmt); | |
2118 | |
2119 scalar_single_iter_cost += stmt_cost * factor; | |
2120 } | |
2121 } | |
2122 return scalar_single_iter_cost; | |
2123 } | |
2124 | |
2125 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ | |
2126 int | |
2127 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, | |
2128 int *peel_iters_epilogue, | |
2129 int scalar_single_iter_cost) | |
2130 { | |
2131 int peel_guard_costs = 0; | |
2132 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
2133 | |
2134 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) | |
2135 { | |
2136 *peel_iters_epilogue = vf/2; | |
2137 if (vect_print_dump_info (REPORT_COST)) | |
2138 fprintf (vect_dump, "cost model: " | |
2139 "epilogue peel iters set to vf/2 because " | |
2140 "loop iterations are unknown ."); | |
2141 | |
2142 /* If peeled iterations are known but number of scalar loop | |
2143 iterations are unknown, count a taken branch per peeled loop. */ | |
2144 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken); | |
2145 } | |
2146 else | |
2147 { | |
2148 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); | |
2149 peel_iters_prologue = niters < peel_iters_prologue ? | |
2150 niters : peel_iters_prologue; | |
2151 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf; | |
2152 } | |
2153 | |
2154 return (peel_iters_prologue * scalar_single_iter_cost) | |
2155 + (*peel_iters_epilogue * scalar_single_iter_cost) | |
2156 + peel_guard_costs; | |
1996 } | 2157 } |
1997 | 2158 |
1998 /* Function vect_estimate_min_profitable_iters | 2159 /* Function vect_estimate_min_profitable_iters |
1999 | 2160 |
2000 Return the number of iterations required for the vector version of the | 2161 Return the number of iterations required for the vector version of the |
2017 int scalar_outside_cost = 0; | 2178 int scalar_outside_cost = 0; |
2018 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | 2179 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
2019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 2180 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
2020 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | 2181 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
2021 int nbbs = loop->num_nodes; | 2182 int nbbs = loop->num_nodes; |
2022 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); | 2183 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); |
2023 int peel_guard_costs = 0; | 2184 int peel_guard_costs = 0; |
2024 int innerloop_iters = 0, factor; | 2185 int innerloop_iters = 0, factor; |
2025 VEC (slp_instance, heap) *slp_instances; | 2186 VEC (slp_instance, heap) *slp_instances; |
2026 slp_instance instance; | 2187 slp_instance instance; |
2027 | 2188 |
2055 "versioning aliasing.\n"); | 2216 "versioning aliasing.\n"); |
2056 } | 2217 } |
2057 | 2218 |
2058 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) | 2219 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) |
2059 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) | 2220 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) |
2060 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST; | 2221 vec_outside_cost += vect_get_cost (cond_branch_taken); |
2061 | 2222 |
2062 /* Count statements in scalar loop. Using this as scalar cost for a single | 2223 /* Count statements in scalar loop. Using this as scalar cost for a single |
2063 iteration for now. | 2224 iteration for now. |
2064 | 2225 |
2065 TODO: Add outer loop support. | 2226 TODO: Add outer loop support. |
2088 /* Skip stmts that are not vectorized inside the loop. */ | 2249 /* Skip stmts that are not vectorized inside the loop. */ |
2089 if (!STMT_VINFO_RELEVANT_P (stmt_info) | 2250 if (!STMT_VINFO_RELEVANT_P (stmt_info) |
2090 && (!STMT_VINFO_LIVE_P (stmt_info) | 2251 && (!STMT_VINFO_LIVE_P (stmt_info) |
2091 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) | 2252 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) |
2092 continue; | 2253 continue; |
2093 scalar_single_iter_cost += cost_for_stmt (stmt) * factor; | |
2094 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; | 2254 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; |
2095 /* FIXME: for stmts in the inner-loop in outer-loop vectorization, | 2255 /* FIXME: for stmts in the inner-loop in outer-loop vectorization, |
2096 some of the "outside" costs are generated inside the outer-loop. */ | 2256 some of the "outside" costs are generated inside the outer-loop. */ |
2097 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); | 2257 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); |
2098 } | 2258 } |
2099 } | 2259 } |
2100 | 2260 |
2261 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo); | |
2262 | |
2101 /* Add additional cost for the peeled instructions in prologue and epilogue | 2263 /* Add additional cost for the peeled instructions in prologue and epilogue |
2102 loop. | 2264 loop. |
2103 | 2265 |
2104 FORNOW: If we don't know the value of peel_iters for prologue or epilogue | 2266 FORNOW: If we don't know the value of peel_iters for prologue or epilogue |
2105 at compile-time - we assume it's vf/2 (the worst would be vf-1). | 2267 at compile-time - we assume it's vf/2 (the worst would be vf-1). |
2106 | 2268 |
2107 TODO: Build an expression that represents peel_iters for prologue and | 2269 TODO: Build an expression that represents peel_iters for prologue and |
2108 epilogue to be used in a run-time test. */ | 2270 epilogue to be used in a run-time test. */ |
2109 | 2271 |
2110 if (byte_misalign < 0) | 2272 if (npeel < 0) |
2111 { | 2273 { |
2112 peel_iters_prologue = vf/2; | 2274 peel_iters_prologue = vf/2; |
2113 if (vect_print_dump_info (REPORT_COST)) | 2275 if (vect_print_dump_info (REPORT_COST)) |
2114 fprintf (vect_dump, "cost model: " | 2276 fprintf (vect_dump, "cost model: " |
2115 "prologue peel iters set to vf/2."); | 2277 "prologue peel iters set to vf/2."); |
2124 | 2286 |
2125 /* If peeled iterations are unknown, count a taken branch and a not taken | 2287 /* If peeled iterations are unknown, count a taken branch and a not taken |
2126 branch per peeled loop. Even if scalar loop iterations are known, | 2288 branch per peeled loop. Even if scalar loop iterations are known, |
2127 vector iterations are not known since peeled prologue iterations are | 2289 vector iterations are not known since peeled prologue iterations are |
2128 not known. Hence guards remain the same. */ | 2290 not known. Hence guards remain the same. */ |
2129 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST | 2291 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken) |
2130 + TARG_COND_NOT_TAKEN_BRANCH_COST); | 2292 + vect_get_cost (cond_branch_not_taken)); |
2293 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost) | |
2294 + (peel_iters_epilogue * scalar_single_iter_cost) | |
2295 + peel_guard_costs; | |
2131 } | 2296 } |
2132 else | 2297 else |
2133 { | 2298 { |
2134 if (byte_misalign) | 2299 peel_iters_prologue = npeel; |
2135 { | 2300 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo, |
2136 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); | 2301 peel_iters_prologue, &peel_iters_epilogue, |
2137 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); | 2302 scalar_single_iter_cost); |
2138 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); | 2303 } |
2139 int nelements = TYPE_VECTOR_SUBPARTS (vectype); | |
2140 | |
2141 peel_iters_prologue = nelements - (byte_misalign / element_size); | |
2142 } | |
2143 else | |
2144 peel_iters_prologue = 0; | |
2145 | |
2146 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) | |
2147 { | |
2148 peel_iters_epilogue = vf/2; | |
2149 if (vect_print_dump_info (REPORT_COST)) | |
2150 fprintf (vect_dump, "cost model: " | |
2151 "epilogue peel iters set to vf/2 because " | |
2152 "loop iterations are unknown ."); | |
2153 | |
2154 /* If peeled iterations are known but number of scalar loop | |
2155 iterations are unknown, count a taken branch per peeled loop. */ | |
2156 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST; | |
2157 | |
2158 } | |
2159 else | |
2160 { | |
2161 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); | |
2162 peel_iters_prologue = niters < peel_iters_prologue ? | |
2163 niters : peel_iters_prologue; | |
2164 peel_iters_epilogue = (niters - peel_iters_prologue) % vf; | |
2165 } | |
2166 } | |
2167 | |
2168 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost) | |
2169 + (peel_iters_epilogue * scalar_single_iter_cost) | |
2170 + peel_guard_costs; | |
2171 | 2304 |
2172 /* FORNOW: The scalar outside cost is incremented in one of the | 2305 /* FORNOW: The scalar outside cost is incremented in one of the |
2173 following ways: | 2306 following ways: |
2174 | 2307 |
2175 1. The vectorizer checks for alignment and aliasing and generates | 2308 1. The vectorizer checks for alignment and aliasing and generates |
2218 TODO: The back end may reorder the BBS's differently and reverse | 2351 TODO: The back end may reorder the BBS's differently and reverse |
2219 conditions/branch directions. Change the estimates below to | 2352 conditions/branch directions. Change the estimates below to |
2220 something more reasonable. */ | 2353 something more reasonable. */ |
2221 | 2354 |
2222 /* If the number of iterations is known and we do not do versioning, we can | 2355 /* If the number of iterations is known and we do not do versioning, we can |
2223 decide whether to vectorize at compile time. Hence the scalar version | 2356 decide whether to vectorize at compile time. Hence the scalar version |
2224 do not carry cost model guard costs. */ | 2357 do not carry cost model guard costs. */ |
2225 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | 2358 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
2226 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) | 2359 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) |
2227 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) | 2360 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) |
2228 { | 2361 { |
2229 /* Cost model check occurs at versioning. */ | 2362 /* Cost model check occurs at versioning. */ |
2230 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) | 2363 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) |
2231 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) | 2364 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) |
2232 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST; | 2365 scalar_outside_cost += vect_get_cost (cond_branch_not_taken); |
2233 else | 2366 else |
2234 { | 2367 { |
2235 /* Cost model check occurs at prologue generation. */ | 2368 /* Cost model check occurs at prologue generation. */ |
2236 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) | 2369 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) |
2237 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST | 2370 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken) |
2238 + TARG_COND_NOT_TAKEN_BRANCH_COST; | 2371 + vect_get_cost (cond_branch_not_taken); |
2239 /* Cost model check occurs at epilogue generation. */ | 2372 /* Cost model check occurs at epilogue generation. */ |
2240 else | 2373 else |
2241 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST; | 2374 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken); |
2242 } | 2375 } |
2243 } | 2376 } |
2244 | 2377 |
2245 /* Add SLP costs. */ | 2378 /* Add SLP costs. */ |
2246 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); | 2379 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); |
2247 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) | 2380 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) |
2248 { | 2381 { |
2249 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance); | 2382 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance); |
2250 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance); | 2383 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance); |
2251 } | 2384 } |
2252 | 2385 |
2253 /* Calculate number of iterations required to make the vector version | 2386 /* Calculate number of iterations required to make the vector version |
2254 profitable, relative to the loop bodies only. The following condition | 2387 profitable, relative to the loop bodies only. The following condition |
2255 must hold true: | 2388 must hold true: |
2256 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC | 2389 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC |
2257 where | 2390 where |
2258 SIC = scalar iteration cost, VIC = vector iteration cost, | 2391 SIC = scalar iteration cost, VIC = vector iteration cost, |
2259 VOC = vector outside cost, VF = vectorization factor, | 2392 VOC = vector outside cost, VF = vectorization factor, |
2346 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | 2479 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
2347 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 2480 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
2348 | 2481 |
2349 | 2482 |
2350 /* Cost of reduction op inside loop. */ | 2483 /* Cost of reduction op inside loop. */ |
2351 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST; | 2484 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) |
2485 += ncopies * vect_get_cost (vector_stmt); | |
2352 | 2486 |
2353 stmt = STMT_VINFO_STMT (stmt_info); | 2487 stmt = STMT_VINFO_STMT (stmt_info); |
2354 | 2488 |
2355 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) | 2489 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) |
2356 { | 2490 { |
2386 orig_stmt = STMT_VINFO_STMT (stmt_info); | 2520 orig_stmt = STMT_VINFO_STMT (stmt_info); |
2387 | 2521 |
2388 code = gimple_assign_rhs_code (orig_stmt); | 2522 code = gimple_assign_rhs_code (orig_stmt); |
2389 | 2523 |
2390 /* Add in cost for initial definition. */ | 2524 /* Add in cost for initial definition. */ |
2391 outer_cost += TARG_SCALAR_TO_VEC_COST; | 2525 outer_cost += vect_get_cost (scalar_to_vec); |
2392 | 2526 |
2393 /* Determine cost of epilogue code. | 2527 /* Determine cost of epilogue code. |
2394 | 2528 |
2395 We have a reduction operator that will reduce the vector in one statement. | 2529 We have a reduction operator that will reduce the vector in one statement. |
2396 Also requires scalar extract. */ | 2530 Also requires scalar extract. */ |
2397 | 2531 |
2398 if (!nested_in_vect_loop_p (loop, orig_stmt)) | 2532 if (!nested_in_vect_loop_p (loop, orig_stmt)) |
2399 { | 2533 { |
2400 if (reduc_code != ERROR_MARK) | 2534 if (reduc_code != ERROR_MARK) |
2401 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST; | 2535 outer_cost += vect_get_cost (vector_stmt) |
2536 + vect_get_cost (vec_to_scalar); | |
2402 else | 2537 else |
2403 { | 2538 { |
2404 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); | 2539 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); |
2405 tree bitsize = | 2540 tree bitsize = |
2406 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt))); | 2541 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt))); |
2409 | 2544 |
2410 optab = optab_for_tree_code (code, vectype, optab_default); | 2545 optab = optab_for_tree_code (code, vectype, optab_default); |
2411 | 2546 |
2412 /* We have a whole vector shift available. */ | 2547 /* We have a whole vector shift available. */ |
2413 if (VECTOR_MODE_P (mode) | 2548 if (VECTOR_MODE_P (mode) |
2414 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing | 2549 && optab_handler (optab, mode) != CODE_FOR_nothing |
2415 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) | 2550 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing) |
2416 /* Final reduction via vector shifts and the reduction operator. Also | 2551 /* Final reduction via vector shifts and the reduction operator. Also |
2417 requires scalar extract. */ | 2552 requires scalar extract. */ |
2418 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST | 2553 outer_cost += ((exact_log2(nelements) * 2) |
2419 + TARG_VEC_TO_SCALAR_COST); | 2554 * vect_get_cost (vector_stmt) |
2555 + vect_get_cost (vec_to_scalar)); | |
2420 else | 2556 else |
2421 /* Use extracts and reduction op for final reduction. For N elements, | 2557 /* Use extracts and reduction op for final reduction. For N elements, |
2422 we have N extracts and N-1 reduction ops. */ | 2558 we have N extracts and N-1 reduction ops. */ |
2423 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST); | 2559 outer_cost += ((nelements + nelements - 1) |
2560 * vect_get_cost (vector_stmt)); | |
2424 } | 2561 } |
2425 } | 2562 } |
2426 | 2563 |
2427 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost; | 2564 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost; |
2428 | 2565 |
2441 | 2578 |
2442 static void | 2579 static void |
2443 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies) | 2580 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies) |
2444 { | 2581 { |
2445 /* loop cost for vec_loop. */ | 2582 /* loop cost for vec_loop. */ |
2446 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST; | 2583 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) |
2584 = ncopies * vect_get_cost (vector_stmt); | |
2447 /* prologue cost for vec_init and vec_step. */ | 2585 /* prologue cost for vec_init and vec_step. */ |
2448 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST; | 2586 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) |
2587 = 2 * vect_get_cost (scalar_to_vec); | |
2449 | 2588 |
2450 if (vect_print_dump_info (REPORT_COST)) | 2589 if (vect_print_dump_info (REPORT_COST)) |
2451 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, " | 2590 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, " |
2452 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info), | 2591 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info), |
2453 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); | 2592 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); |
2460 STMT - a stmt that performs an induction operation in the loop. | 2599 STMT - a stmt that performs an induction operation in the loop. |
2461 IV_PHI - the initial value of the induction variable | 2600 IV_PHI - the initial value of the induction variable |
2462 | 2601 |
2463 Output: | 2602 Output: |
2464 Return a vector variable, initialized with the first VF values of | 2603 Return a vector variable, initialized with the first VF values of |
2465 the induction variable. E.g., for an iv with IV_PHI='X' and | 2604 the induction variable. E.g., for an iv with IV_PHI='X' and |
2466 evolution S, for a vector of 4 units, we want to return: | 2605 evolution S, for a vector of 4 units, we want to return: |
2467 [X, X + S, X + 2*S, X + 3*S]. */ | 2606 [X, X + S, X + 2*S, X + 3*S]. */ |
2468 | 2607 |
2469 static tree | 2608 static tree |
2470 get_initial_def_for_induction (gimple iv_phi) | 2609 get_initial_def_for_induction (gimple iv_phi) |
2471 { | 2610 { |
2472 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi); | 2611 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi); |
2473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); | 2612 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); |
2474 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 2613 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
2475 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi)); | 2614 tree scalar_type; |
2476 tree vectype; | 2615 tree vectype; |
2477 int nunits; | 2616 int nunits; |
2478 edge pe = loop_preheader_edge (loop); | 2617 edge pe = loop_preheader_edge (loop); |
2479 struct loop *iv_loop; | 2618 struct loop *iv_loop; |
2480 basic_block new_bb; | 2619 basic_block new_bb; |
2499 edge latch_e; | 2638 edge latch_e; |
2500 tree loop_arg; | 2639 tree loop_arg; |
2501 gimple_stmt_iterator si; | 2640 gimple_stmt_iterator si; |
2502 basic_block bb = gimple_bb (iv_phi); | 2641 basic_block bb = gimple_bb (iv_phi); |
2503 tree stepvectype; | 2642 tree stepvectype; |
2504 | 2643 tree resvectype; |
2505 vectype = get_vectype_for_scalar_type (scalar_type); | |
2506 gcc_assert (vectype); | |
2507 nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
2508 ncopies = vf / nunits; | |
2509 | |
2510 gcc_assert (phi_info); | |
2511 gcc_assert (ncopies >= 1); | |
2512 | |
2513 /* Find the first insertion point in the BB. */ | |
2514 si = gsi_after_labels (bb); | |
2515 | |
2516 if (INTEGRAL_TYPE_P (scalar_type)) | |
2517 step_expr = build_int_cst (scalar_type, 0); | |
2518 else if (POINTER_TYPE_P (scalar_type)) | |
2519 step_expr = build_int_cst (sizetype, 0); | |
2520 else | |
2521 step_expr = build_real (scalar_type, dconst0); | |
2522 | 2644 |
2523 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */ | 2645 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */ |
2524 if (nested_in_vect_loop_p (loop, iv_phi)) | 2646 if (nested_in_vect_loop_p (loop, iv_phi)) |
2525 { | 2647 { |
2526 nested_in_vect_loop = true; | 2648 nested_in_vect_loop = true; |
2533 latch_e = loop_latch_edge (iv_loop); | 2655 latch_e = loop_latch_edge (iv_loop); |
2534 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e); | 2656 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e); |
2535 | 2657 |
2536 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi)); | 2658 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi)); |
2537 gcc_assert (access_fn); | 2659 gcc_assert (access_fn); |
2660 STRIP_NOPS (access_fn); | |
2538 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn, | 2661 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn, |
2539 &init_expr, &step_expr); | 2662 &init_expr, &step_expr); |
2540 gcc_assert (ok); | 2663 gcc_assert (ok); |
2541 pe = loop_preheader_edge (iv_loop); | 2664 pe = loop_preheader_edge (iv_loop); |
2542 | 2665 |
2666 scalar_type = TREE_TYPE (init_expr); | |
2667 vectype = get_vectype_for_scalar_type (scalar_type); | |
2668 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi))); | |
2669 gcc_assert (vectype); | |
2670 nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
2671 ncopies = vf / nunits; | |
2672 | |
2673 gcc_assert (phi_info); | |
2674 gcc_assert (ncopies >= 1); | |
2675 | |
2676 /* Find the first insertion point in the BB. */ | |
2677 si = gsi_after_labels (bb); | |
2678 | |
2543 /* Create the vector that holds the initial_value of the induction. */ | 2679 /* Create the vector that holds the initial_value of the induction. */ |
2544 if (nested_in_vect_loop) | 2680 if (nested_in_vect_loop) |
2545 { | 2681 { |
2546 /* iv_loop is nested in the loop to be vectorized. init_expr had already | 2682 /* iv_loop is nested in the loop to be vectorized. init_expr had already |
2547 been created during vectorization of previous stmts; We obtain it from | 2683 been created during vectorization of previous stmts. We obtain it |
2548 the STMT_VINFO_VEC_STMT of the defining stmt. */ | 2684 from the STMT_VINFO_VEC_STMT of the defining stmt. */ |
2549 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, | 2685 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, |
2550 loop_preheader_edge (iv_loop)); | 2686 loop_preheader_edge (iv_loop)); |
2551 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL); | 2687 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL); |
2552 } | 2688 } |
2553 else | 2689 else |
2563 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | 2699 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); |
2564 gcc_assert (!new_bb); | 2700 gcc_assert (!new_bb); |
2565 } | 2701 } |
2566 | 2702 |
2567 t = NULL_TREE; | 2703 t = NULL_TREE; |
2568 t = tree_cons (NULL_TREE, init_expr, t); | 2704 t = tree_cons (NULL_TREE, new_name, t); |
2569 for (i = 1; i < nunits; i++) | 2705 for (i = 1; i < nunits; i++) |
2570 { | 2706 { |
2571 /* Create: new_name_i = new_name + step_expr */ | 2707 /* Create: new_name_i = new_name + step_expr */ |
2572 enum tree_code code = POINTER_TYPE_P (scalar_type) | 2708 enum tree_code code = POINTER_TYPE_P (scalar_type) |
2573 ? POINTER_PLUS_EXPR : PLUS_EXPR; | 2709 ? POINTER_PLUS_EXPR : PLUS_EXPR; |
2604 expr = build_int_cst (TREE_TYPE (step_expr), vf); | 2740 expr = build_int_cst (TREE_TYPE (step_expr), vf); |
2605 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), | 2741 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), |
2606 expr, step_expr); | 2742 expr, step_expr); |
2607 } | 2743 } |
2608 | 2744 |
2609 t = NULL_TREE; | 2745 t = unshare_expr (new_name); |
2610 for (i = 0; i < nunits; i++) | |
2611 t = tree_cons (NULL_TREE, unshare_expr (new_name), t); | |
2612 gcc_assert (CONSTANT_CLASS_P (new_name)); | 2746 gcc_assert (CONSTANT_CLASS_P (new_name)); |
2613 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name)); | 2747 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name)); |
2614 gcc_assert (stepvectype); | 2748 gcc_assert (stepvectype); |
2615 vec = build_vector (stepvectype, t); | 2749 vec = build_vector_from_val (stepvectype, t); |
2616 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); | 2750 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); |
2617 | 2751 |
2618 | 2752 |
2619 /* Create the following def-use cycle: | 2753 /* Create the following def-use cycle: |
2620 loop prolog: | 2754 loop prolog: |
2664 | 2798 |
2665 /* Create the vector that holds the step of the induction. */ | 2799 /* Create the vector that holds the step of the induction. */ |
2666 expr = build_int_cst (TREE_TYPE (step_expr), nunits); | 2800 expr = build_int_cst (TREE_TYPE (step_expr), nunits); |
2667 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), | 2801 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), |
2668 expr, step_expr); | 2802 expr, step_expr); |
2669 t = NULL_TREE; | 2803 t = unshare_expr (new_name); |
2670 for (i = 0; i < nunits; i++) | |
2671 t = tree_cons (NULL_TREE, unshare_expr (new_name), t); | |
2672 gcc_assert (CONSTANT_CLASS_P (new_name)); | 2804 gcc_assert (CONSTANT_CLASS_P (new_name)); |
2673 vec = build_vector (stepvectype, t); | 2805 vec = build_vector_from_val (stepvectype, t); |
2674 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); | 2806 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); |
2675 | 2807 |
2676 vec_def = induc_def; | 2808 vec_def = induc_def; |
2677 prev_stmt_vinfo = vinfo_for_stmt (induction_phi); | 2809 prev_stmt_vinfo = vinfo_for_stmt (induction_phi); |
2678 for (i = 1; i < ncopies; i++) | 2810 for (i = 1; i < ncopies; i++) |
2682 vec_def, vec_step); | 2814 vec_def, vec_step); |
2683 vec_def = make_ssa_name (vec_dest, new_stmt); | 2815 vec_def = make_ssa_name (vec_dest, new_stmt); |
2684 gimple_assign_set_lhs (new_stmt, vec_def); | 2816 gimple_assign_set_lhs (new_stmt, vec_def); |
2685 | 2817 |
2686 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); | 2818 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); |
2819 if (!useless_type_conversion_p (resvectype, vectype)) | |
2820 { | |
2821 new_stmt = gimple_build_assign_with_ops | |
2822 (VIEW_CONVERT_EXPR, | |
2823 vect_get_new_vect_var (resvectype, vect_simple_var, | |
2824 "vec_iv_"), | |
2825 build1 (VIEW_CONVERT_EXPR, resvectype, | |
2826 gimple_assign_lhs (new_stmt)), NULL_TREE); | |
2827 gimple_assign_set_lhs (new_stmt, | |
2828 make_ssa_name | |
2829 (gimple_assign_lhs (new_stmt), new_stmt)); | |
2830 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); | |
2831 } | |
2687 set_vinfo_for_stmt (new_stmt, | 2832 set_vinfo_for_stmt (new_stmt, |
2688 new_stmt_vec_info (new_stmt, loop_vinfo, NULL)); | 2833 new_stmt_vec_info (new_stmt, loop_vinfo, NULL)); |
2689 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; | 2834 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; |
2690 prev_stmt_vinfo = vinfo_for_stmt (new_stmt); | 2835 prev_stmt_vinfo = vinfo_for_stmt (new_stmt); |
2691 } | 2836 } |
2729 fprintf (vect_dump, "\n"); | 2874 fprintf (vect_dump, "\n"); |
2730 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM); | 2875 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM); |
2731 } | 2876 } |
2732 | 2877 |
2733 STMT_VINFO_VEC_STMT (phi_info) = induction_phi; | 2878 STMT_VINFO_VEC_STMT (phi_info) = induction_phi; |
2879 if (!useless_type_conversion_p (resvectype, vectype)) | |
2880 { | |
2881 new_stmt = gimple_build_assign_with_ops | |
2882 (VIEW_CONVERT_EXPR, | |
2883 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"), | |
2884 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE); | |
2885 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt); | |
2886 gimple_assign_set_lhs (new_stmt, induc_def); | |
2887 si = gsi_start_bb (bb); | |
2888 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); | |
2889 set_vinfo_for_stmt (new_stmt, | |
2890 new_stmt_vec_info (new_stmt, loop_vinfo, NULL)); | |
2891 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt)) | |
2892 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi)); | |
2893 } | |
2894 | |
2734 return induc_def; | 2895 return induc_def; |
2735 } | 2896 } |
2736 | 2897 |
2737 | 2898 |
2738 /* Function get_initial_def_for_reduction | 2899 /* Function get_initial_def_for_reduction |
2809 nested_in_vect_loop = true; | 2970 nested_in_vect_loop = true; |
2810 else | 2971 else |
2811 gcc_assert (loop == (gimple_bb (stmt))->loop_father); | 2972 gcc_assert (loop == (gimple_bb (stmt))->loop_father); |
2812 | 2973 |
2813 /* In case of double reduction we only create a vector variable to be put | 2974 /* In case of double reduction we only create a vector variable to be put |
2814 in the reduction phi node. The actual statement creation is done in | 2975 in the reduction phi node. The actual statement creation is done in |
2815 vect_create_epilog_for_reduction. */ | 2976 vect_create_epilog_for_reduction. */ |
2816 if (adjustment_def && nested_in_vect_loop | 2977 if (adjustment_def && nested_in_vect_loop |
2817 && TREE_CODE (init_val) == SSA_NAME | 2978 && TREE_CODE (init_val) == SSA_NAME |
2818 && (def_stmt = SSA_NAME_DEF_STMT (init_val)) | 2979 && (def_stmt = SSA_NAME_DEF_STMT (init_val)) |
2819 && gimple_code (def_stmt) == GIMPLE_PHI | 2980 && gimple_code (def_stmt) == GIMPLE_PHI |
2855 NULL); | 3016 NULL); |
2856 else | 3017 else |
2857 *adjustment_def = init_val; | 3018 *adjustment_def = init_val; |
2858 } | 3019 } |
2859 | 3020 |
2860 if (code == MULT_EXPR || code == BIT_AND_EXPR) | 3021 if (code == MULT_EXPR) |
2861 { | 3022 { |
2862 real_init_val = dconst1; | 3023 real_init_val = dconst1; |
2863 int_init_val = 1; | 3024 int_init_val = 1; |
2864 } | 3025 } |
3026 | |
3027 if (code == BIT_AND_EXPR) | |
3028 int_init_val = -1; | |
2865 | 3029 |
2866 if (SCALAR_FLOAT_TYPE_P (scalar_type)) | 3030 if (SCALAR_FLOAT_TYPE_P (scalar_type)) |
2867 def_for_init = build_real (scalar_type, real_init_val); | 3031 def_for_init = build_real (scalar_type, real_init_val); |
2868 else | 3032 else |
2869 def_for_init = build_int_cst (scalar_type, int_init_val); | 3033 def_for_init = build_int_cst (scalar_type, int_init_val); |
2897 *adjustment_def = NULL_TREE; | 3061 *adjustment_def = NULL_TREE; |
2898 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL); | 3062 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL); |
2899 break; | 3063 break; |
2900 } | 3064 } |
2901 | 3065 |
2902 for (i = nunits - 1; i >= 0; --i) | 3066 init_def = build_vector_from_val (vectype, init_value); |
2903 t = tree_cons (NULL_TREE, init_value, t); | |
2904 | |
2905 if (TREE_CONSTANT (init_val)) | |
2906 init_def = build_vector (vectype, t); | |
2907 else | |
2908 init_def = build_constructor_from_list (vectype, t); | |
2909 | |
2910 break; | 3067 break; |
2911 | 3068 |
2912 default: | 3069 default: |
2913 gcc_unreachable (); | 3070 gcc_unreachable (); |
2914 } | 3071 } |
2924 | 3081 |
2925 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector | 3082 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector |
2926 reduction statements. | 3083 reduction statements. |
2927 STMT is the scalar reduction stmt that is being vectorized. | 3084 STMT is the scalar reduction stmt that is being vectorized. |
2928 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the | 3085 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the |
2929 number of elements that we can fit in a vectype (nunits). In this case | 3086 number of elements that we can fit in a vectype (nunits). In this case |
2930 we have to generate more than one vector stmt - i.e - we need to "unroll" | 3087 we have to generate more than one vector stmt - i.e - we need to "unroll" |
2931 the vector stmt by a factor VF/nunits. For more details see documentation | 3088 the vector stmt by a factor VF/nunits. For more details see documentation |
2932 in vectorizable_operation. | 3089 in vectorizable_operation. |
2933 REDUC_CODE is the tree-code for the epilog reduction. | 3090 REDUC_CODE is the tree-code for the epilog reduction. |
2934 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction | 3091 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction |
3004 tree bitsize, bitpos; | 3161 tree bitsize, bitpos; |
3005 tree adjustment_def = NULL; | 3162 tree adjustment_def = NULL; |
3006 tree vec_initial_def = NULL; | 3163 tree vec_initial_def = NULL; |
3007 tree reduction_op, expr, def; | 3164 tree reduction_op, expr, def; |
3008 tree orig_name, scalar_result; | 3165 tree orig_name, scalar_result; |
3009 imm_use_iterator imm_iter; | 3166 imm_use_iterator imm_iter, phi_imm_iter; |
3010 use_operand_p use_p; | 3167 use_operand_p use_p, phi_use_p; |
3011 bool extract_scalar_result = false; | 3168 bool extract_scalar_result = false; |
3012 gimple use_stmt, orig_stmt, reduction_phi = NULL; | 3169 gimple use_stmt, orig_stmt, reduction_phi = NULL; |
3013 bool nested_in_vect_loop = false; | 3170 bool nested_in_vect_loop = false; |
3014 VEC (gimple, heap) *new_phis = NULL; | 3171 VEC (gimple, heap) *new_phis = NULL; |
3015 enum vect_def_type dt = vect_unknown_def_type; | 3172 enum vect_def_type dt = vect_unknown_def_type; |
3069 | 3226 |
3070 (in case of SLP, do it for all the phis). */ | 3227 (in case of SLP, do it for all the phis). */ |
3071 | 3228 |
3072 /* Get the loop-entry arguments. */ | 3229 /* Get the loop-entry arguments. */ |
3073 if (slp_node) | 3230 if (slp_node) |
3074 vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index); | 3231 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs, |
3232 NULL, reduc_index); | |
3075 else | 3233 else |
3076 { | 3234 { |
3077 vec_initial_defs = VEC_alloc (tree, heap, 1); | 3235 vec_initial_defs = VEC_alloc (tree, heap, 1); |
3078 /* For the case of reduction, vect_get_vec_def_for_operand returns | 3236 /* For the case of reduction, vect_get_vec_def_for_operand returns |
3079 the scalar def before the loop, that defines the initial value | 3237 the scalar def before the loop, that defines the initial value |
3082 &adjustment_def); | 3240 &adjustment_def); |
3083 VEC_quick_push (tree, vec_initial_defs, vec_initial_def); | 3241 VEC_quick_push (tree, vec_initial_defs, vec_initial_def); |
3084 } | 3242 } |
3085 | 3243 |
3086 /* Set phi nodes arguments. */ | 3244 /* Set phi nodes arguments. */ |
3087 for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++) | 3245 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi) |
3088 { | 3246 { |
3089 tree vec_init_def = VEC_index (tree, vec_initial_defs, i); | 3247 tree vec_init_def = VEC_index (tree, vec_initial_defs, i); |
3090 tree def = VEC_index (tree, vect_defs, i); | 3248 tree def = VEC_index (tree, vect_defs, i); |
3091 for (j = 0; j < ncopies; j++) | 3249 for (j = 0; j < ncopies; j++) |
3092 { | 3250 { |
3148 Store them in NEW_PHIS. */ | 3306 Store them in NEW_PHIS. */ |
3149 | 3307 |
3150 exit_bb = single_exit (loop)->dest; | 3308 exit_bb = single_exit (loop)->dest; |
3151 prev_phi_info = NULL; | 3309 prev_phi_info = NULL; |
3152 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs)); | 3310 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs)); |
3153 for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++) | 3311 FOR_EACH_VEC_ELT (tree, vect_defs, i, def) |
3154 { | 3312 { |
3155 for (j = 0; j < ncopies; j++) | 3313 for (j = 0; j < ncopies; j++) |
3156 { | 3314 { |
3157 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb); | 3315 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb); |
3158 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL)); | 3316 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL)); |
3165 } | 3323 } |
3166 | 3324 |
3167 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); | 3325 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); |
3168 prev_phi_info = vinfo_for_stmt (phi); | 3326 prev_phi_info = vinfo_for_stmt (phi); |
3169 } | 3327 } |
3328 } | |
3329 | |
3330 /* The epilogue is created for the outer-loop, i.e., for the loop being | |
3331 vectorized. */ | |
3332 if (double_reduc) | |
3333 { | |
3334 loop = outer_loop; | |
3335 exit_bb = single_exit (loop)->dest; | |
3170 } | 3336 } |
3171 | 3337 |
3172 exit_gsi = gsi_after_labels (exit_bb); | 3338 exit_gsi = gsi_after_labels (exit_bb); |
3173 | 3339 |
3174 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 | 3340 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 |
3207 bitsize = TYPE_SIZE (scalar_type); | 3373 bitsize = TYPE_SIZE (scalar_type); |
3208 | 3374 |
3209 /* In case this is a reduction in an inner-loop while vectorizing an outer | 3375 /* In case this is a reduction in an inner-loop while vectorizing an outer |
3210 loop - we don't need to extract a single scalar result at the end of the | 3376 loop - we don't need to extract a single scalar result at the end of the |
3211 inner-loop (unless it is double reduction, i.e., the use of reduction is | 3377 inner-loop (unless it is double reduction, i.e., the use of reduction is |
3212 outside the outer-loop). The final vector of partial results will be used | 3378 outside the outer-loop). The final vector of partial results will be used |
3213 in the vectorized outer-loop, or reduced to a scalar result at the end of | 3379 in the vectorized outer-loop, or reduced to a scalar result at the end of |
3214 the outer-loop. */ | 3380 the outer-loop. */ |
3215 if (nested_in_vect_loop && !double_reduc) | 3381 if (nested_in_vect_loop && !double_reduc) |
3216 goto vect_finalize_reduction; | 3382 goto vect_finalize_reduction; |
3217 | 3383 |
3245 int bit_offset; | 3411 int bit_offset; |
3246 int element_bitsize = tree_low_cst (bitsize, 1); | 3412 int element_bitsize = tree_low_cst (bitsize, 1); |
3247 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); | 3413 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); |
3248 tree vec_temp; | 3414 tree vec_temp; |
3249 | 3415 |
3250 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) | 3416 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing) |
3251 shift_code = VEC_RSHIFT_EXPR; | 3417 shift_code = VEC_RSHIFT_EXPR; |
3252 else | 3418 else |
3253 have_whole_vector_shift = false; | 3419 have_whole_vector_shift = false; |
3254 | 3420 |
3255 /* Regardless of whether we have a whole vector shift, if we're | 3421 /* Regardless of whether we have a whole vector shift, if we're |
3261 if (!VECTOR_MODE_P (mode)) | 3427 if (!VECTOR_MODE_P (mode)) |
3262 have_whole_vector_shift = false; | 3428 have_whole_vector_shift = false; |
3263 else | 3429 else |
3264 { | 3430 { |
3265 optab optab = optab_for_tree_code (code, vectype, optab_default); | 3431 optab optab = optab_for_tree_code (code, vectype, optab_default); |
3266 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) | 3432 if (optab_handler (optab, mode) == CODE_FOR_nothing) |
3267 have_whole_vector_shift = false; | 3433 have_whole_vector_shift = false; |
3268 } | 3434 } |
3269 | 3435 |
3270 if (have_whole_vector_shift && !slp_node) | 3436 if (have_whole_vector_shift && !slp_node) |
3271 { | 3437 { |
3319 | 3485 |
3320 if (vect_print_dump_info (REPORT_DETAILS)) | 3486 if (vect_print_dump_info (REPORT_DETAILS)) |
3321 fprintf (vect_dump, "Reduce using scalar code. "); | 3487 fprintf (vect_dump, "Reduce using scalar code. "); |
3322 | 3488 |
3323 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); | 3489 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); |
3324 for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++) | 3490 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi) |
3325 { | 3491 { |
3326 vec_temp = PHI_RESULT (new_phi); | 3492 vec_temp = PHI_RESULT (new_phi); |
3327 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, | 3493 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, |
3328 bitsize_zero_node); | 3494 bitsize_zero_node); |
3329 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); | 3495 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); |
3366 } | 3532 } |
3367 } | 3533 } |
3368 } | 3534 } |
3369 | 3535 |
3370 /* The only case where we need to reduce scalar results in SLP, is | 3536 /* The only case where we need to reduce scalar results in SLP, is |
3371 unrolling. If the size of SCALAR_RESULTS is greater than | 3537 unrolling. If the size of SCALAR_RESULTS is greater than |
3372 GROUP_SIZE, we reduce them combining elements modulo | 3538 GROUP_SIZE, we reduce them combining elements modulo |
3373 GROUP_SIZE. */ | 3539 GROUP_SIZE. */ |
3374 if (slp_node) | 3540 if (slp_node) |
3375 { | 3541 { |
3376 tree res, first_res, new_res; | 3542 tree res, first_res, new_res; |
3422 VEC_safe_push (tree, heap, scalar_results, new_temp); | 3588 VEC_safe_push (tree, heap, scalar_results, new_temp); |
3423 } | 3589 } |
3424 | 3590 |
3425 vect_finalize_reduction: | 3591 vect_finalize_reduction: |
3426 | 3592 |
3593 if (double_reduc) | |
3594 loop = loop->inner; | |
3595 | |
3427 /* 2.5 Adjust the final result by the initial value of the reduction | 3596 /* 2.5 Adjust the final result by the initial value of the reduction |
3428 variable. (When such adjustment is not needed, then | 3597 variable. (When such adjustment is not needed, then |
3429 'adjustment_def' is zero). For example, if code is PLUS we create: | 3598 'adjustment_def' is zero). For example, if code is PLUS we create: |
3430 new_temp = loop_exit_def + adjustment_def */ | 3599 new_temp = loop_exit_def + adjustment_def */ |
3431 | 3600 |
3469 VEC_replace (tree, scalar_results, 0, new_temp); | 3638 VEC_replace (tree, scalar_results, 0, new_temp); |
3470 | 3639 |
3471 VEC_replace (gimple, new_phis, 0, epilog_stmt); | 3640 VEC_replace (gimple, new_phis, 0, epilog_stmt); |
3472 } | 3641 } |
3473 | 3642 |
3474 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit | 3643 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit |
3475 phis with new adjusted scalar results, i.e., replace use <s_out0> | 3644 phis with new adjusted scalar results, i.e., replace use <s_out0> |
3476 with use <s_out4>. | 3645 with use <s_out4>. |
3477 | 3646 |
3478 Transform: | 3647 Transform: |
3479 loop_exit: | 3648 loop_exit: |
3495 s_out4 = adjust_result <s_out3> | 3664 s_out4 = adjust_result <s_out3> |
3496 use <s_out4> | 3665 use <s_out4> |
3497 use <s_out4> */ | 3666 use <s_out4> */ |
3498 | 3667 |
3499 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in | 3668 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in |
3500 case that GROUP_SIZE is greater than vectorization factor). Therefore, we | 3669 case that GROUP_SIZE is greater than vectorization factor). Therefore, we |
3501 need to match SCALAR_RESULTS with corresponding statements. The first | 3670 need to match SCALAR_RESULTS with corresponding statements. The first |
3502 (GROUP_SIZE / number of new vector stmts) scalar results correspond to | 3671 (GROUP_SIZE / number of new vector stmts) scalar results correspond to |
3503 the first vector stmt, etc. | 3672 the first vector stmt, etc. |
3504 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */ | 3673 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */ |
3505 if (group_size > VEC_length (gimple, new_phis)) | 3674 if (group_size > VEC_length (gimple, new_phis)) |
3506 { | 3675 { |
3529 scalar_dest = gimple_assign_lhs (current_stmt); | 3698 scalar_dest = gimple_assign_lhs (current_stmt); |
3530 } | 3699 } |
3531 | 3700 |
3532 phis = VEC_alloc (gimple, heap, 3); | 3701 phis = VEC_alloc (gimple, heap, 3); |
3533 /* Find the loop-closed-use at the loop exit of the original scalar | 3702 /* Find the loop-closed-use at the loop exit of the original scalar |
3534 result. (The reduction result is expected to have two immediate uses - | 3703 result. (The reduction result is expected to have two immediate uses - |
3535 one at the latch block, and one at the loop exit). */ | 3704 one at the latch block, and one at the loop exit). */ |
3536 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) | 3705 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) |
3537 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) | 3706 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) |
3538 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p)); | 3707 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p)); |
3539 | 3708 |
3540 /* We expect to have found an exit_phi because of loop-closed-ssa | 3709 /* We expect to have found an exit_phi because of loop-closed-ssa |
3541 form. */ | 3710 form. */ |
3542 gcc_assert (!VEC_empty (gimple, phis)); | 3711 gcc_assert (!VEC_empty (gimple, phis)); |
3543 | 3712 |
3544 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++) | 3713 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi) |
3545 { | 3714 { |
3546 if (outer_loop) | 3715 if (outer_loop) |
3547 { | 3716 { |
3548 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); | 3717 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); |
3549 gimple vect_phi; | 3718 gimple vect_phi; |
3630 } | 3799 } |
3631 | 3800 |
3632 vect_phi_res = PHI_RESULT (vect_phi); | 3801 vect_phi_res = PHI_RESULT (vect_phi); |
3633 | 3802 |
3634 /* Replace the use, i.e., set the correct vs1 in the regular | 3803 /* Replace the use, i.e., set the correct vs1 in the regular |
3635 reduction phi node. FORNOW, NCOPIES is always 1, so the | 3804 reduction phi node. FORNOW, NCOPIES is always 1, so the |
3636 loop is redundant. */ | 3805 loop is redundant. */ |
3637 use = reduction_phi; | 3806 use = reduction_phi; |
3638 for (j = 0; j < ncopies; j++) | 3807 for (j = 0; j < ncopies; j++) |
3639 { | 3808 { |
3640 edge pr_edge = loop_preheader_edge (loop); | 3809 edge pr_edge = loop_preheader_edge (loop); |
3641 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); | 3810 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); |
3642 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); | 3811 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); |
3643 } | 3812 } |
3644 } | 3813 } |
3645 } | 3814 } |
3646 | 3815 } |
3816 | |
3817 VEC_free (gimple, heap, phis); | |
3818 if (nested_in_vect_loop) | |
3819 { | |
3820 if (double_reduc) | |
3821 loop = outer_loop; | |
3822 else | |
3823 continue; | |
3824 } | |
3825 | |
3826 phis = VEC_alloc (gimple, heap, 3); | |
3827 /* Find the loop-closed-use at the loop exit of the original scalar | |
3828 result. (The reduction result is expected to have two immediate uses, | |
3829 one at the latch block, and one at the loop exit). For double | |
3830 reductions we are looking for exit phis of the outer loop. */ | |
3831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) | |
3832 { | |
3833 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) | |
3834 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p)); | |
3835 else | |
3836 { | |
3837 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI) | |
3838 { | |
3839 tree phi_res = PHI_RESULT (USE_STMT (use_p)); | |
3840 | |
3841 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res) | |
3842 { | |
3843 if (!flow_bb_inside_loop_p (loop, | |
3844 gimple_bb (USE_STMT (phi_use_p)))) | |
3845 VEC_safe_push (gimple, heap, phis, | |
3846 USE_STMT (phi_use_p)); | |
3847 } | |
3848 } | |
3849 } | |
3850 } | |
3851 | |
3852 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi) | |
3853 { | |
3647 /* Replace the uses: */ | 3854 /* Replace the uses: */ |
3648 orig_name = PHI_RESULT (exit_phi); | 3855 orig_name = PHI_RESULT (exit_phi); |
3649 scalar_result = VEC_index (tree, scalar_results, k); | 3856 scalar_result = VEC_index (tree, scalar_results, k); |
3650 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) | 3857 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) |
3651 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) | 3858 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) |
3666 If VEC_STMT is also passed, vectorize the STMT: create a vectorized | 3873 If VEC_STMT is also passed, vectorize the STMT: create a vectorized |
3667 stmt to replace it, put it in VEC_STMT, and insert it at GSI. | 3874 stmt to replace it, put it in VEC_STMT, and insert it at GSI. |
3668 Return FALSE if not a vectorizable STMT, TRUE otherwise. | 3875 Return FALSE if not a vectorizable STMT, TRUE otherwise. |
3669 | 3876 |
3670 This function also handles reduction idioms (patterns) that have been | 3877 This function also handles reduction idioms (patterns) that have been |
3671 recognized in advance during vect_pattern_recog. In this case, STMT may be | 3878 recognized in advance during vect_pattern_recog. In this case, STMT may be |
3672 of this form: | 3879 of this form: |
3673 X = pattern_expr (arg0, arg1, ..., X) | 3880 X = pattern_expr (arg0, arg1, ..., X) |
3674 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original | 3881 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original |
3675 sequence that had been detected and replaced by the pattern-stmt (STMT). | 3882 sequence that had been detected and replaced by the pattern-stmt (STMT). |
3676 | 3883 |
3687 vectorization factor should be V8HI); on the other hand, the vectype that | 3894 vectorization factor should be V8HI); on the other hand, the vectype that |
3688 is used to create the vector form is actually V4SI (the type of the result). | 3895 is used to create the vector form is actually V4SI (the type of the result). |
3689 | 3896 |
3690 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that | 3897 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that |
3691 indicates what is the actual level of parallelism (V8HI in the example), so | 3898 indicates what is the actual level of parallelism (V8HI in the example), so |
3692 that the right vectorization factor would be derived. This vectype | 3899 that the right vectorization factor would be derived. This vectype |
3693 corresponds to the type of arguments to the reduction stmt, and should *NOT* | 3900 corresponds to the type of arguments to the reduction stmt, and should *NOT* |
3694 be used to create the vectorized stmt. The right vectype for the vectorized | 3901 be used to create the vectorized stmt. The right vectype for the vectorized |
3695 stmt is obtained from the type of the result X: | 3902 stmt is obtained from the type of the result X: |
3696 get_vectype_for_scalar_type (TREE_TYPE (X)) | 3903 get_vectype_for_scalar_type (TREE_TYPE (X)) |
3697 | 3904 |
3698 This means that, contrary to "regular" reductions (or "regular" stmts in | 3905 This means that, contrary to "regular" reductions (or "regular" stmts in |
3699 general), the following equation: | 3906 general), the following equation: |
3745 tree def_arg; | 3952 tree def_arg; |
3746 gimple def_arg_stmt; | 3953 gimple def_arg_stmt; |
3747 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL; | 3954 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL; |
3748 VEC (gimple, heap) *phis = NULL; | 3955 VEC (gimple, heap) *phis = NULL; |
3749 int vec_num; | 3956 int vec_num; |
3750 tree def0, def1; | 3957 tree def0, def1, tem; |
3751 | 3958 |
3752 if (nested_in_vect_loop_p (loop, stmt)) | 3959 if (nested_in_vect_loop_p (loop, stmt)) |
3753 { | 3960 { |
3754 outer_loop = loop; | 3961 outer_loop = loop; |
3755 loop = loop->inner; | 3962 loop = loop->inner; |
3786 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt); | 3993 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt); |
3787 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); | 3994 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); |
3788 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); | 3995 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); |
3789 } | 3996 } |
3790 | 3997 |
3791 /* 3. Check the operands of the operation. The first operands are defined | 3998 /* 3. Check the operands of the operation. The first operands are defined |
3792 inside the loop body. The last operand is the reduction variable, | 3999 inside the loop body. The last operand is the reduction variable, |
3793 which is defined by the loop-header-phi. */ | 4000 which is defined by the loop-header-phi. */ |
3794 | 4001 |
3795 gcc_assert (is_gimple_assign (stmt)); | 4002 gcc_assert (is_gimple_assign (stmt)); |
3796 | 4003 |
3797 /* Flatten RHS */ | 4004 /* Flatten RHS. */ |
3798 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) | 4005 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) |
3799 { | 4006 { |
3800 case GIMPLE_SINGLE_RHS: | 4007 case GIMPLE_SINGLE_RHS: |
3801 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)); | 4008 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)); |
3802 if (op_type == ternary_op) | 4009 if (op_type == ternary_op) |
3831 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) | 4038 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) |
3832 && !SCALAR_FLOAT_TYPE_P (scalar_type)) | 4039 && !SCALAR_FLOAT_TYPE_P (scalar_type)) |
3833 return false; | 4040 return false; |
3834 | 4041 |
3835 /* All uses but the last are expected to be defined in the loop. | 4042 /* All uses but the last are expected to be defined in the loop. |
3836 The last use is the reduction variable. In case of nested cycle this | 4043 The last use is the reduction variable. In case of nested cycle this |
3837 assumption is not true: we use reduc_index to record the index of the | 4044 assumption is not true: we use reduc_index to record the index of the |
3838 reduction variable. */ | 4045 reduction variable. */ |
3839 for (i = 0; i < op_type-1; i++) | 4046 for (i = 0; i < op_type-1; i++) |
3840 { | 4047 { |
3841 tree tem; | |
3842 | |
3843 /* The condition of COND_EXPR is checked in vectorizable_condition(). */ | 4048 /* The condition of COND_EXPR is checked in vectorizable_condition(). */ |
3844 if (i == 0 && code == COND_EXPR) | 4049 if (i == 0 && code == COND_EXPR) |
3845 continue; | 4050 continue; |
3846 | 4051 |
3847 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, | 4052 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, |
3862 reduc_def_stmt = def_stmt; | 4067 reduc_def_stmt = def_stmt; |
3863 reduc_index = i; | 4068 reduc_index = i; |
3864 } | 4069 } |
3865 } | 4070 } |
3866 | 4071 |
3867 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt, | 4072 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt, |
3868 &def, &dt); | 4073 &def, &dt, &tem); |
4074 if (!vectype_in) | |
4075 vectype_in = tem; | |
3869 gcc_assert (is_simple_use); | 4076 gcc_assert (is_simple_use); |
3870 gcc_assert (dt == vect_reduction_def | 4077 gcc_assert (dt == vect_reduction_def |
3871 || dt == vect_nested_cycle | 4078 || dt == vect_nested_cycle |
3872 || ((dt == vect_internal_def || dt == vect_external_def | 4079 || ((dt == vect_internal_def || dt == vect_external_def |
3873 || dt == vect_constant_def || dt == vect_induction_def) | 4080 || dt == vect_constant_def || dt == vect_induction_def) |
3920 fprintf (vect_dump, "no optab."); | 4127 fprintf (vect_dump, "no optab."); |
3921 | 4128 |
3922 return false; | 4129 return false; |
3923 } | 4130 } |
3924 | 4131 |
3925 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing) | 4132 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing) |
3926 { | 4133 { |
3927 if (vect_print_dump_info (REPORT_DETAILS)) | 4134 if (vect_print_dump_info (REPORT_DETAILS)) |
3928 fprintf (vect_dump, "op not supported by target."); | 4135 fprintf (vect_dump, "op not supported by target."); |
3929 | 4136 |
3930 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD | 4137 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD |
3962 | 4169 |
3963 This means that: | 4170 This means that: |
3964 1. The tree-code that is used to create the vector operation in the | 4171 1. The tree-code that is used to create the vector operation in the |
3965 epilog code (that reduces the partial results) is not the | 4172 epilog code (that reduces the partial results) is not the |
3966 tree-code of STMT, but is rather the tree-code of the original | 4173 tree-code of STMT, but is rather the tree-code of the original |
3967 stmt from the pattern that STMT is replacing. I.e, in the example | 4174 stmt from the pattern that STMT is replacing. I.e, in the example |
3968 above we want to use 'widen_sum' in the loop, but 'plus' in the | 4175 above we want to use 'widen_sum' in the loop, but 'plus' in the |
3969 epilog. | 4176 epilog. |
3970 2. The type (mode) we use to check available target support | 4177 2. The type (mode) we use to check available target support |
3971 for the vector operation to be created in the *epilog*, is | 4178 for the vector operation to be created in the *epilog*, is |
3972 determined by the type of the reduction variable (in the example | 4179 determined by the type of the reduction variable (in the example |
3973 above we'd check this: plus_optab[vect_int_mode]). | 4180 above we'd check this: optab_handler (plus_optab, vect_int_mode])). |
3974 However the type (mode) we use to check available target support | 4181 However the type (mode) we use to check available target support |
3975 for the vector operation to be created *inside the loop*, is | 4182 for the vector operation to be created *inside the loop*, is |
3976 determined by the type of the other arguments to STMT (in the | 4183 determined by the type of the other arguments to STMT (in the |
3977 example we'd check this: widen_sum_optab[vect_short_mode]). | 4184 example we'd check this: optab_handler (widen_sum_optab, |
4185 vect_short_mode)). | |
3978 | 4186 |
3979 This is contrary to "regular" reductions, in which the types of all | 4187 This is contrary to "regular" reductions, in which the types of all |
3980 the arguments are the same as the type of the reduction variable. | 4188 the arguments are the same as the type of the reduction variable. |
3981 For "regular" reductions we can therefore use the same vector type | 4189 For "regular" reductions we can therefore use the same vector type |
3982 (and also the same tree-code) when generating the epilog code and | 4190 (and also the same tree-code) when generating the epilog code and |
4025 | 4233 |
4026 epilog_reduc_code = ERROR_MARK; | 4234 epilog_reduc_code = ERROR_MARK; |
4027 } | 4235 } |
4028 | 4236 |
4029 if (reduc_optab | 4237 if (reduc_optab |
4030 && optab_handler (reduc_optab, vec_mode)->insn_code | 4238 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing) |
4031 == CODE_FOR_nothing) | |
4032 { | 4239 { |
4033 if (vect_print_dump_info (REPORT_DETAILS)) | 4240 if (vect_print_dump_info (REPORT_DETAILS)) |
4034 fprintf (vect_dump, "reduc op not supported by target."); | 4241 fprintf (vect_dump, "reduc op not supported by target."); |
4035 | 4242 |
4036 epilog_reduc_code = ERROR_MARK; | 4243 epilog_reduc_code = ERROR_MARK; |
4159 } | 4366 } |
4160 | 4367 |
4161 /* Handle uses. */ | 4368 /* Handle uses. */ |
4162 if (j == 0) | 4369 if (j == 0) |
4163 { | 4370 { |
4371 tree op0, op1 = NULL_TREE; | |
4372 | |
4373 op0 = ops[!reduc_index]; | |
4374 if (op_type == ternary_op) | |
4375 { | |
4376 if (reduc_index == 0) | |
4377 op1 = ops[2]; | |
4378 else | |
4379 op1 = ops[1]; | |
4380 } | |
4381 | |
4164 if (slp_node) | 4382 if (slp_node) |
4165 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1); | 4383 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1, |
4384 -1); | |
4166 else | 4385 else |
4167 { | 4386 { |
4168 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index], | 4387 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index], |
4169 stmt, NULL); | 4388 stmt, NULL); |
4170 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0); | 4389 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0); |
4171 if (op_type == ternary_op) | 4390 if (op_type == ternary_op) |
4172 { | 4391 { |
4173 if (reduc_index == 0) | 4392 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt, |
4174 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt, | 4393 NULL); |
4175 NULL); | |
4176 else | |
4177 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, | |
4178 NULL); | |
4179 | |
4180 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1); | 4394 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1); |
4181 } | 4395 } |
4182 } | 4396 } |
4183 } | 4397 } |
4184 else | 4398 else |
4200 reduc_def = gimple_assign_lhs (new_stmt); | 4414 reduc_def = gimple_assign_lhs (new_stmt); |
4201 | 4415 |
4202 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; | 4416 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; |
4203 } | 4417 } |
4204 | 4418 |
4205 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++) | 4419 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0) |
4206 { | 4420 { |
4207 if (slp_node) | 4421 if (slp_node) |
4208 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i)); | 4422 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i)); |
4209 else | 4423 else |
4210 { | 4424 { |
4365 return true; | 4579 return true; |
4366 } | 4580 } |
4367 | 4581 |
4368 /* Function vectorizable_live_operation. | 4582 /* Function vectorizable_live_operation. |
4369 | 4583 |
4370 STMT computes a value that is used outside the loop. Check if | 4584 STMT computes a value that is used outside the loop. Check if |
4371 it can be supported. */ | 4585 it can be supported. */ |
4372 | 4586 |
4373 bool | 4587 bool |
4374 vectorizable_live_operation (gimple stmt, | 4588 vectorizable_live_operation (gimple stmt, |
4375 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, | 4589 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, |
4406 op_type = TREE_CODE_LENGTH (code); | 4620 op_type = TREE_CODE_LENGTH (code); |
4407 rhs_class = get_gimple_rhs_class (code); | 4621 rhs_class = get_gimple_rhs_class (code); |
4408 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op); | 4622 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op); |
4409 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op); | 4623 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op); |
4410 | 4624 |
4411 /* FORNOW: support only if all uses are invariant. This means | 4625 /* FORNOW: support only if all uses are invariant. This means |
4412 that the scalar operations can remain in place, unvectorized. | 4626 that the scalar operations can remain in place, unvectorized. |
4413 The original last scalar value that they compute will be used. */ | 4627 The original last scalar value that they compute will be used. */ |
4414 | 4628 |
4415 for (i = 0; i < op_type; i++) | 4629 for (i = 0; i < op_type; i++) |
4416 { | 4630 { |
4517 | 4731 |
4518 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a | 4732 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a |
4519 compile time constant), or it is a constant that doesn't divide by the | 4733 compile time constant), or it is a constant that doesn't divide by the |
4520 vectorization factor, then an epilog loop needs to be created. | 4734 vectorization factor, then an epilog loop needs to be created. |
4521 We therefore duplicate the loop: the original loop will be vectorized, | 4735 We therefore duplicate the loop: the original loop will be vectorized, |
4522 and will compute the first (n/VF) iterations. The second copy of the loop | 4736 and will compute the first (n/VF) iterations. The second copy of the loop |
4523 will remain scalar and will compute the remaining (n%VF) iterations. | 4737 will remain scalar and will compute the remaining (n%VF) iterations. |
4524 (VF is the vectorization factor). */ | 4738 (VF is the vectorization factor). */ |
4525 | 4739 |
4526 if (do_peeling_for_loop_bound) | 4740 if (do_peeling_for_loop_bound) |
4527 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio, | 4741 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio, |