comparison gcc/tree-vect-loop.c @ 145:1830386684a0
gcc-9.2.0
author | anatofuz
---|---
date | Thu, 13 Feb 2020 11:34:05 +0900
parents | 84e7813d76e9
children |
131:84e7813d76e9 (parent) | 145:1830386684a0 (this revision)
---|---
1 /* Loop Vectorization | 1 /* Loop Vectorization |
2 Copyright (C) 2003-2018 Free Software Foundation, Inc. | 2 Copyright (C) 2003-2020 Free Software Foundation, Inc. |
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and | 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and |
4 Ira Rosen <irar@il.ibm.com> | 4 Ira Rosen <irar@il.ibm.com> |
5 | 5 |
6 This file is part of GCC. | 6 This file is part of GCC. |
7 | 7 |
41 #include "tree-ssa-loop-ivopts.h" | 41 #include "tree-ssa-loop-ivopts.h" |
42 #include "tree-ssa-loop-manip.h" | 42 #include "tree-ssa-loop-manip.h" |
43 #include "tree-ssa-loop-niter.h" | 43 #include "tree-ssa-loop-niter.h" |
44 #include "tree-ssa-loop.h" | 44 #include "tree-ssa-loop.h" |
45 #include "cfgloop.h" | 45 #include "cfgloop.h" |
46 #include "params.h" | |
47 #include "tree-scalar-evolution.h" | 46 #include "tree-scalar-evolution.h" |
48 #include "tree-vectorizer.h" | 47 #include "tree-vectorizer.h" |
49 #include "gimple-fold.h" | 48 #include "gimple-fold.h" |
50 #include "cgraph.h" | 49 #include "cgraph.h" |
51 #include "tree-cfg.h" | 50 #include "tree-cfg.h" |
152 For additional information on this project see: | 151 For additional information on this project see: |
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html | 152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html |
154 */ | 153 */ |
155 | 154 |
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *); | 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *); |
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info, | |
157 bool *, bool *); | |
157 | 158 |
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one | 159 /* Subroutine of vect_determine_vf_for_stmt that handles only one |
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE | 160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE |
160 may already be set for general statements (not just data refs). */ | 161 may already be set for general statements (not just data refs). */ |
161 | 162 |
162 static opt_result | 163 static opt_result |
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info, | 164 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info, |
164 bool vectype_maybe_set_p, | 165 bool vectype_maybe_set_p, |
165 poly_uint64 *vf, | 166 poly_uint64 *vf) |
166 vec<stmt_vec_info> *mask_producers) | |
167 { | 167 { |
168 gimple *stmt = stmt_info->stmt; | 168 gimple *stmt = stmt_info->stmt; |
169 | 169 |
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info) | 170 if ((!STMT_VINFO_RELEVANT_P (stmt_info) |
171 && !STMT_VINFO_LIVE_P (stmt_info)) | 171 && !STMT_VINFO_LIVE_P (stmt_info)) |
189 that contain a data ref, or for "pattern-stmts" (stmts generated | 189 that contain a data ref, or for "pattern-stmts" (stmts generated |
190 by the vectorizer to represent/replace a certain idiom). */ | 190 by the vectorizer to represent/replace a certain idiom). */ |
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info) | 191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info) |
192 || vectype_maybe_set_p) | 192 || vectype_maybe_set_p) |
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype); | 193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype); |
194 else if (stmt_vectype == boolean_type_node) | |
195 mask_producers->safe_push (stmt_info); | |
196 else | 194 else |
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype; | 195 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype; |
198 } | 196 } |
199 | 197 |
200 if (nunits_vectype) | 198 if (nunits_vectype) |
203 return opt_result::success (); | 201 return opt_result::success (); |
204 } | 202 } |
205 | 203 |
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector | 204 /* Subroutine of vect_determine_vectorization_factor. Set the vector |
207 types of STMT_INFO and all attached pattern statements and update | 205 types of STMT_INFO and all attached pattern statements and update |
208 the vectorization factor VF accordingly. If some of the statements | 206 the vectorization factor VF accordingly. Return true on success |
209 produce a mask result whose vector type can only be calculated later, | 207 or false if something prevented vectorization. */ |
210 add them to MASK_PRODUCERS. Return true on success or false if | |
211 something prevented vectorization. */ | |
212 | 208 |
213 static opt_result | 209 static opt_result |
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf, | 210 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf) |
215 vec<stmt_vec_info> *mask_producers) | |
216 { | 211 { |
217 vec_info *vinfo = stmt_info->vinfo; | 212 vec_info *vinfo = stmt_info->vinfo; |
218 if (dump_enabled_p ()) | 213 if (dump_enabled_p ()) |
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G", | 214 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G", |
220 stmt_info->stmt); | 215 stmt_info->stmt); |
221 opt_result res | 216 opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf); |
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers); | |
223 if (!res) | 217 if (!res) |
224 return res; | 218 return res; |
225 | 219 |
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info) | 220 if (STMT_VINFO_IN_PATTERN_P (stmt_info) |
227 && STMT_VINFO_RELATED_STMT (stmt_info)) | 221 && STMT_VINFO_RELATED_STMT (stmt_info)) |
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si)); | 230 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si)); |
237 if (dump_enabled_p ()) | 231 if (dump_enabled_p ()) |
238 dump_printf_loc (MSG_NOTE, vect_location, | 232 dump_printf_loc (MSG_NOTE, vect_location, |
239 "==> examining pattern def stmt: %G", | 233 "==> examining pattern def stmt: %G", |
240 def_stmt_info->stmt); | 234 def_stmt_info->stmt); |
241 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true, | 235 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf); |
242 vf, mask_producers)) | |
243 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, | |
244 vf, mask_producers); | |
245 if (!res) | 236 if (!res) |
246 return res; | 237 return res; |
247 } | 238 } |
248 | 239 |
249 if (dump_enabled_p ()) | 240 if (dump_enabled_p ()) |
250 dump_printf_loc (MSG_NOTE, vect_location, | 241 dump_printf_loc (MSG_NOTE, vect_location, |
251 "==> examining pattern statement: %G", | 242 "==> examining pattern statement: %G", |
252 stmt_info->stmt); | 243 stmt_info->stmt); |
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers); | 244 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf); |
254 if (!res) | 245 if (!res) |
255 return res; | 246 return res; |
256 } | 247 } |
257 | 248 |
258 return opt_result::success (); | 249 return opt_result::success (); |
284 */ | 275 */ |
285 | 276 |
286 static opt_result | 277 static opt_result |
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo) | 278 vect_determine_vectorization_factor (loop_vec_info loop_vinfo) |
288 { | 279 { |
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 280 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | 281 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
291 unsigned nbbs = loop->num_nodes; | 282 unsigned nbbs = loop->num_nodes; |
292 poly_uint64 vectorization_factor = 1; | 283 poly_uint64 vectorization_factor = 1; |
293 tree scalar_type = NULL_TREE; | 284 tree scalar_type = NULL_TREE; |
294 gphi *phi; | 285 gphi *phi; |
295 tree vectype; | 286 tree vectype; |
296 stmt_vec_info stmt_info; | 287 stmt_vec_info stmt_info; |
297 unsigned i; | 288 unsigned i; |
298 auto_vec<stmt_vec_info> mask_producers; | |
299 | 289 |
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor"); | 290 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor"); |
301 | 291 |
302 for (i = 0; i < nbbs; i++) | 292 for (i = 0; i < nbbs; i++) |
303 { | 293 { |
323 if (dump_enabled_p ()) | 313 if (dump_enabled_p ()) |
324 dump_printf_loc (MSG_NOTE, vect_location, | 314 dump_printf_loc (MSG_NOTE, vect_location, |
325 "get vectype for scalar type: %T\n", | 315 "get vectype for scalar type: %T\n", |
326 scalar_type); | 316 scalar_type); |
327 | 317 |
328 vectype = get_vectype_for_scalar_type (scalar_type); | 318 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); |
329 if (!vectype) | 319 if (!vectype) |
330 return opt_result::failure_at (phi, | 320 return opt_result::failure_at (phi, |
331 "not vectorized: unsupported " | 321 "not vectorized: unsupported " |
332 "data-type %T\n", | 322 "data-type %T\n", |
333 scalar_type); | 323 scalar_type); |
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); | 341 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); |
352 gsi_next (&si)) | 342 gsi_next (&si)) |
353 { | 343 { |
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); | 344 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); |
355 opt_result res | 345 opt_result res |
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor, | 346 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor); |
357 &mask_producers); | |
358 if (!res) | 347 if (!res) |
359 return res; | 348 return res; |
360 } | 349 } |
361 } | 350 } |
362 | 351 |
370 | 359 |
371 if (known_le (vectorization_factor, 1U)) | 360 if (known_le (vectorization_factor, 1U)) |
372 return opt_result::failure_at (vect_location, | 361 return opt_result::failure_at (vect_location, |
373 "not vectorized: unsupported data-type\n"); | 362 "not vectorized: unsupported data-type\n"); |
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; | 363 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; |
375 | |
376 for (i = 0; i < mask_producers.length (); i++) | |
377 { | |
378 stmt_info = mask_producers[i]; | |
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info); | |
380 if (!mask_type) | |
381 return opt_result::propagate_failure (mask_type); | |
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type; | |
383 } | |
384 | |
385 return opt_result::success (); | 364 return opt_result::success (); |
386 } | 365 } |
387 | 366 |
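For orientation: the vectorization factor settled above is the number of scalar iterations covered by one vector iteration, derived from the vector types of the loop's statements. A minimal sketch, assuming a target with 128-bit vectors so that `int` maps to a 4-lane vector type and the loop gets VF = 4 (illustrative input, not part of this changeset):

```c
/* With 128-bit vectors, scalar type "int" gives a 4-lane vector type,
   so the vectorization factor is 4 and one vector iteration covers
   a[i], a[i+1], a[i+2], a[i+3].  */
void
scale (int *a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] *= 2;
}
```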
388 | 367 |
389 /* Function vect_is_simple_iv_evolution. | 368 /* Function vect_is_simple_iv_evolution. |
479 in LOOP. LOOP_VINFO represents the loop that is now being | 458 in LOOP. LOOP_VINFO represents the loop that is now being |
480 considered for vectorization (can be LOOP, or an outer-loop | 459 considered for vectorization (can be LOOP, or an outer-loop |
481 enclosing LOOP). */ | 460 enclosing LOOP). */ |
482 | 461 |
483 static void | 462 static void |
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) | 463 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop) |
485 { | 464 { |
486 basic_block bb = loop->header; | 465 basic_block bb = loop->header; |
487 tree init, step; | 466 tree init, step; |
488 auto_vec<stmt_vec_info, 64> worklist; | 467 auto_vec<stmt_vec_info, 64> worklist; |
489 gphi_iterator gsi; | 468 gphi_iterator gsi; |
490 bool double_reduc; | 469 bool double_reduc, reduc_chain; |
491 | 470 |
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles"); | 471 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles"); |
493 | 472 |
494 /* First - identify all inductions. Reduction detection assumes that all the | 473 /* First - identify all inductions. Reduction detection assumes that all the |
495 inductions have been identified, therefore, this order must not be | 474 inductions have been identified, therefore, this order must not be |
557 | 536 |
558 gcc_assert (!virtual_operand_p (def) | 537 gcc_assert (!virtual_operand_p (def) |
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); | 538 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); |
560 | 539 |
561 stmt_vec_info reduc_stmt_info | 540 stmt_vec_info reduc_stmt_info |
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo, | 541 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc, |
563 &double_reduc, false); | 542 &reduc_chain); |
564 if (reduc_stmt_info) | 543 if (reduc_stmt_info) |
565 { | 544 { |
566 if (double_reduc) | 545 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info; |
567 { | 546 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo; |
568 if (dump_enabled_p ()) | 547 if (double_reduc) |
569 dump_printf_loc (MSG_NOTE, vect_location, | 548 { |
549 if (dump_enabled_p ()) | |
550 dump_printf_loc (MSG_NOTE, vect_location, | |
570 "Detected double reduction.\n"); | 551 "Detected double reduction.\n"); |
571 | 552 |
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; | 553 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; |
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info) | 554 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def; |
574 = vect_double_reduction_def; | |
575 } | 555 } |
576 else | 556 else |
577 { | 557 { |
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo)) | 558 if (loop != LOOP_VINFO_LOOP (loop_vinfo)) |
579 { | 559 { |
580 if (dump_enabled_p ()) | 560 if (dump_enabled_p ()) |
581 dump_printf_loc (MSG_NOTE, vect_location, | 561 dump_printf_loc (MSG_NOTE, vect_location, |
582 "Detected vectorizable nested cycle.\n"); | 562 "Detected vectorizable nested cycle.\n"); |
583 | 563 |
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle; | 564 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle; |
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle; | |
586 } | 565 } |
587 else | 566 else |
588 { | 567 { |
589 if (dump_enabled_p ()) | 568 if (dump_enabled_p ()) |
590 dump_printf_loc (MSG_NOTE, vect_location, | 569 dump_printf_loc (MSG_NOTE, vect_location, |
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def; | 572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def; |
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def; | 573 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def; |
595 /* Store the reduction cycles for possible vectorization in | 574 /* Store the reduction cycles for possible vectorization in |
596 loop-aware SLP if it was not detected as reduction | 575 loop-aware SLP if it was not detected as reduction |
597 chain. */ | 576 chain. */ |
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info)) | 577 if (! reduc_chain) |
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push | 578 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push |
600 (reduc_stmt_info); | 579 (reduc_stmt_info); |
601 } | 580 } |
602 } | 581 } |
603 } | 582 } |
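The cycle classifications assigned above can be illustrated with two assumed example loops (not taken from this changeset). A plain reduction phi becomes vect_reduction_def; an outer-loop phi whose accumulation happens entirely in the inner loop becomes vect_double_reduction_def when the outer loop is the one being analyzed:

```c
/* Plain reduction: the loop phi for S is vect_reduction_def.  */
int
sum (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s += a[i];
  return s;
}

/* Double reduction (analyzing the outer loop): S is accumulated by the
   inner loop, so the outer phi for S is vect_double_reduction_def.  */
int
sum2d (int a[][64], int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    for (int j = 0; j < 64; j++)
      s += a[i][j];
  return s;
}
```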
631 a[i] = i; */ | 610 a[i] = i; */ |
632 | 611 |
633 static void | 612 static void |
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo) | 613 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo) |
635 { | 614 { |
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 615 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
637 | 616 |
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop); | 617 vect_analyze_scalar_cycles_1 (loop_vinfo, loop); |
639 | 618 |
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially. | 619 /* When vectorizing an outer-loop, the inner-loop is executed sequentially. |
641 Reductions in such inner-loop therefore have different properties than | 620 Reductions in such inner-loop therefore have different properties than |
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)); | 641 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)); |
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info); | 642 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info); |
664 do | 643 do |
665 { | 644 { |
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info); | 645 stmtp = STMT_VINFO_RELATED_STMT (stmt_info); |
646 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp) | |
647 == STMT_VINFO_DEF_TYPE (stmt_info)); | |
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp; | 648 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp; |
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info); | 649 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info); |
669 if (stmt_info) | 650 if (stmt_info) |
670 REDUC_GROUP_NEXT_ELEMENT (stmtp) | 651 REDUC_GROUP_NEXT_ELEMENT (stmtp) |
671 = STMT_VINFO_RELATED_STMT (stmt_info); | 652 = STMT_VINFO_RELATED_STMT (stmt_info); |
672 } | 653 } |
673 while (stmt_info); | 654 while (stmt_info); |
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def; | |
675 } | 655 } |
676 | 656 |
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */ | 657 /* Fixup scalar cycles that now have their stmts detected as patterns. */ |
678 | 658 |
679 static void | 659 static void |
686 if (STMT_VINFO_IN_PATTERN_P (first)) | 666 if (STMT_VINFO_IN_PATTERN_P (first)) |
687 { | 667 { |
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); | 668 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); |
689 while (next) | 669 while (next) |
690 { | 670 { |
691 if (! STMT_VINFO_IN_PATTERN_P (next)) | 671 if (! STMT_VINFO_IN_PATTERN_P (next) |
672 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1) | |
692 break; | 673 break; |
693 next = REDUC_GROUP_NEXT_ELEMENT (next); | 674 next = REDUC_GROUP_NEXT_ELEMENT (next); |
694 } | 675 } |
695 /* If not all stmts in the chain are patterns try to handle | 676 /* If not all stmts in the chain are patterns or if we failed |
696 the chain without patterns. */ | 677 to update STMT_VINFO_REDUC_IDX try to handle the chain |
697 if (! next) | 678 without patterns. */ |
679 if (! next | |
680 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1) | |
698 { | 681 { |
699 vect_fixup_reduc_chain (first); | 682 vect_fixup_reduc_chain (first); |
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] | 683 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] |
701 = STMT_VINFO_RELATED_STMT (first); | 684 = STMT_VINFO_RELATED_STMT (first); |
702 } | 685 } |
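For reference, the reduction chains this fixup walks via REDUC_GROUP_FIRST_ELEMENT/REDUC_GROUP_NEXT_ELEMENT link several accumulations into the same variable within one iteration; a hypothetical example of such a chain:

```c
/* Reduction chain: the two accumulations into S in each iteration are
   the linked group elements.  */
int
sum_pairs (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i += 2)
    {
      s += a[i];
      s += a[i + 1];
    }
  return s;
}
```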
712 | 695 |
713 Return the loop exit condition. */ | 696 Return the loop exit condition. */ |
714 | 697 |
715 | 698 |
716 static gcond * | 699 static gcond * |
717 vect_get_loop_niters (struct loop *loop, tree *assumptions, | 700 vect_get_loop_niters (class loop *loop, tree *assumptions, |
718 tree *number_of_iterations, tree *number_of_iterationsm1) | 701 tree *number_of_iterations, tree *number_of_iterationsm1) |
719 { | 702 { |
720 edge exit = single_exit (loop); | 703 edge exit = single_exit (loop); |
721 struct tree_niter_desc niter_desc; | 704 class tree_niter_desc niter_desc; |
722 tree niter_assumptions, niter, may_be_zero; | 705 tree niter_assumptions, niter, may_be_zero; |
723 gcond *cond = get_loop_exit_condition (loop); | 706 gcond *cond = get_loop_exit_condition (loop); |
724 | 707 |
725 *assumptions = boolean_true_node; | 708 *assumptions = boolean_true_node; |
726 *number_of_iterationsm1 = chrec_dont_know; | 709 *number_of_iterationsm1 = chrec_dont_know; |
728 DUMP_VECT_SCOPE ("get_loop_niters"); | 711 DUMP_VECT_SCOPE ("get_loop_niters"); |
729 | 712 |
730 if (!exit) | 713 if (!exit) |
731 return cond; | 714 return cond; |
732 | 715 |
733 niter = chrec_dont_know; | |
734 may_be_zero = NULL_TREE; | 716 may_be_zero = NULL_TREE; |
735 niter_assumptions = boolean_true_node; | |
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) | 717 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) |
737 || chrec_contains_undetermined (niter_desc.niter)) | 718 || chrec_contains_undetermined (niter_desc.niter)) |
738 return cond; | 719 return cond; |
739 | 720 |
740 niter_assumptions = niter_desc.assumptions; | 721 niter_assumptions = niter_desc.assumptions; |
793 Used as predicate for dfs order traversal of the loop bbs. */ | 774 Used as predicate for dfs order traversal of the loop bbs. */ |
794 | 775 |
795 static bool | 776 static bool |
796 bb_in_loop_p (const_basic_block bb, const void *data) | 777 bb_in_loop_p (const_basic_block bb, const void *data) |
797 { | 778 { |
798 const struct loop *const loop = (const struct loop *)data; | 779 const class loop *const loop = (const class loop *)data; |
799 if (flow_bb_inside_loop_p (loop, bb)) | 780 if (flow_bb_inside_loop_p (loop, bb)) |
800 return true; | 781 return true; |
801 return false; | 782 return false; |
802 } | 783 } |
803 | 784 |
804 | 785 |
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as | 786 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as |
806 stmt_vec_info structs for all the stmts in LOOP_IN. */ | 787 stmt_vec_info structs for all the stmts in LOOP_IN. */ |
807 | 788 |
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared) | 789 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) |
809 : vec_info (vec_info::loop, init_cost (loop_in), shared), | 790 : vec_info (vec_info::loop, init_cost (loop_in), shared), |
810 loop (loop_in), | 791 loop (loop_in), |
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)), | 792 bbs (XCNEWVEC (basic_block, loop->num_nodes)), |
812 num_itersm1 (NULL_TREE), | 793 num_itersm1 (NULL_TREE), |
813 num_iters (NULL_TREE), | 794 num_iters (NULL_TREE), |
817 versioning_threshold (0), | 798 versioning_threshold (0), |
818 vectorization_factor (0), | 799 vectorization_factor (0), |
819 max_vectorization_factor (0), | 800 max_vectorization_factor (0), |
820 mask_skip_niters (NULL_TREE), | 801 mask_skip_niters (NULL_TREE), |
821 mask_compare_type (NULL_TREE), | 802 mask_compare_type (NULL_TREE), |
803 simd_if_cond (NULL_TREE), | |
822 unaligned_dr (NULL), | 804 unaligned_dr (NULL), |
823 peeling_for_alignment (0), | 805 peeling_for_alignment (0), |
824 ptr_mask (0), | 806 ptr_mask (0), |
825 ivexpr_map (NULL), | 807 ivexpr_map (NULL), |
808 scan_map (NULL), | |
826 slp_unrolling_factor (1), | 809 slp_unrolling_factor (1), |
827 single_scalar_iteration_cost (0), | 810 single_scalar_iteration_cost (0), |
811 vec_outside_cost (0), | |
812 vec_inside_cost (0), | |
828 vectorizable (false), | 813 vectorizable (false), |
829 can_fully_mask_p (true), | 814 can_fully_mask_p (true), |
830 fully_masked_p (false), | 815 fully_masked_p (false), |
831 peeling_for_gaps (false), | 816 peeling_for_gaps (false), |
832 peeling_for_niter (false), | 817 peeling_for_niter (false), |
833 operands_swapped (false), | |
834 no_data_dependencies (false), | 818 no_data_dependencies (false), |
835 has_mask_store (false), | 819 has_mask_store (false), |
820 scalar_loop_scaling (profile_probability::uninitialized ()), | |
836 scalar_loop (NULL), | 821 scalar_loop (NULL), |
837 orig_loop_info (NULL) | 822 orig_loop_info (NULL) |
838 { | 823 { |
839 /* CHECKME: We want to visit all BBs before their successors (except for | 824 /* CHECKME: We want to visit all BBs before their successors (except for |
840 latch blocks, for which this assertion wouldn't hold). In the simple | 825 latch blocks, for which this assertion wouldn't hold). In the simple |
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) | 845 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) |
861 { | 846 { |
862 gimple *stmt = gsi_stmt (si); | 847 gimple *stmt = gsi_stmt (si); |
863 gimple_set_uid (stmt, 0); | 848 gimple_set_uid (stmt, 0); |
864 add_stmt (stmt); | 849 add_stmt (stmt); |
865 } | 850 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the |
866 } | 851 third argument is the #pragma omp simd if (x) condition, when 0, |
852 loop shouldn't be vectorized, when non-zero constant, it should | |
853 be vectorized normally, otherwise versioned with vectorized loop | |
854 done if the condition is non-zero at runtime. */ | |
855 if (loop_in->simduid | |
856 && is_gimple_call (stmt) | |
857 && gimple_call_internal_p (stmt) | |
858 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE | |
859 && gimple_call_num_args (stmt) >= 3 | |
860 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME | |
861 && (loop_in->simduid | |
862 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))) | |
863 { | |
864 tree arg = gimple_call_arg (stmt, 2); | |
865 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME) | |
866 simd_if_cond = arg; | |
867 else | |
868 gcc_assert (integer_nonzerop (arg)); | |
869 } | |
870 } | |
871 } | |
872 | |
873 epilogue_vinfos.create (6); | |
867 } | 874 } |
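The new simd_if_cond handling records the third argument of the .GOMP_SIMD_LANE call, which carries the OpenMP `if` clause of a simd construct (an OpenMP 5.0 feature). A sketch of the three cases the comment describes, with an assumed example loop:

```c
/* "#pragma omp simd if (c)": c == 0 forces the loop to stay scalar, a
   non-zero constant vectorizes it normally, and a variable condition
   makes the vectorizer version the loop, running the vector copy only
   when c is non-zero at run time.  */
void
saxpy (float *restrict x, const float *restrict y, float a, int n, int c)
{
  #pragma omp simd if (c)
  for (int i = 0; i < n; i++)
    x[i] += a * y[i];
}
```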
868 | 875 |
869 /* Free all levels of MASKS. */ | 876 /* Free all levels of MASKS. */ |
870 | 877 |
871 void | 878 void |
881 /* Free all memory used by the _loop_vec_info, as well as all the | 888 /* Free all memory used by the _loop_vec_info, as well as all the |
882 stmt_vec_info structs of all the stmts in the loop. */ | 889 stmt_vec_info structs of all the stmts in the loop. */ |
883 | 890 |
884 _loop_vec_info::~_loop_vec_info () | 891 _loop_vec_info::~_loop_vec_info () |
885 { | 892 { |
886 int nbbs; | |
887 gimple_stmt_iterator si; | |
888 int j; | |
889 | |
890 nbbs = loop->num_nodes; | |
891 for (j = 0; j < nbbs; j++) | |
892 { | |
893 basic_block bb = bbs[j]; | |
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); ) | |
895 { | |
896 gimple *stmt = gsi_stmt (si); | |
897 | |
898 /* We may have broken canonical form by moving a constant | |
899 into RHS1 of a commutative op. Fix such occurrences. */ | |
900 if (operands_swapped && is_gimple_assign (stmt)) | |
901 { | |
902 enum tree_code code = gimple_assign_rhs_code (stmt); | |
903 | |
904 if ((code == PLUS_EXPR | |
905 || code == POINTER_PLUS_EXPR | |
906 || code == MULT_EXPR) | |
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt))) | |
908 swap_ssa_operands (stmt, | |
909 gimple_assign_rhs1_ptr (stmt), | |
910 gimple_assign_rhs2_ptr (stmt)); | |
911 else if (code == COND_EXPR | |
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt))) | |
913 { | |
914 tree cond_expr = gimple_assign_rhs1 (stmt); | |
915 enum tree_code cond_code = TREE_CODE (cond_expr); | |
916 | |
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison) | |
918 { | |
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, | |
920 0)); | |
921 cond_code = invert_tree_comparison (cond_code, | |
922 honor_nans); | |
923 if (cond_code != ERROR_MARK) | |
924 { | |
925 TREE_SET_CODE (cond_expr, cond_code); | |
926 swap_ssa_operands (stmt, | |
927 gimple_assign_rhs2_ptr (stmt), | |
928 gimple_assign_rhs3_ptr (stmt)); | |
929 } | |
930 } | |
931 } | |
932 } | |
933 gsi_next (&si); | |
934 } | |
935 } | |
936 | |
937 free (bbs); | 893 free (bbs); |
938 | 894 |
939 release_vec_loop_masks (&masks); | 895 release_vec_loop_masks (&masks); |
940 delete ivexpr_map; | 896 delete ivexpr_map; |
897 delete scan_map; | |
898 epilogue_vinfos.release (); | |
941 | 899 |
942 loop->aux = NULL; | 900 loop->aux = NULL; |
943 } | 901 } |
944 | 902 |
945 /* Return an invariant or register for EXPR and emit necessary | 903 /* Return an invariant or register for EXPR and emit necessary |
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */ | 963 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */ |
1006 | 964 |
1007 static bool | 965 static bool |
1008 vect_verify_full_masking (loop_vec_info loop_vinfo) | 966 vect_verify_full_masking (loop_vec_info loop_vinfo) |
1009 { | 967 { |
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 968 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1011 unsigned int min_ni_width; | 969 unsigned int min_ni_width; |
970 unsigned int max_nscalars_per_iter | |
971 = vect_get_max_nscalars_per_iter (loop_vinfo); | |
1012 | 972 |
1013 /* Use a normal loop if there are no statements that need masking. | 973 /* Use a normal loop if there are no statements that need masking. |
1014 This only happens in rare degenerate cases: it means that the loop | 974 This only happens in rare degenerate cases: it means that the loop |
1015 has no loads, no stores, and no live-out values. */ | 975 has no loads, no stores, and no live-out values. */ |
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) | 976 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) |
1025 widest_int max_back_edges; | 985 widest_int max_back_edges; |
1026 if (max_loop_iterations (loop, &max_back_edges)) | 986 if (max_loop_iterations (loop, &max_back_edges)) |
1027 max_ni = wi::smin (max_ni, max_back_edges + 1); | 987 max_ni = wi::smin (max_ni, max_back_edges + 1); |
1028 | 988 |
1029 /* Account for rgroup masks, in which each bit is replicated N times. */ | 989 /* Account for rgroup masks, in which each bit is replicated N times. */ |
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo); | 990 max_ni *= max_nscalars_per_iter; |
1031 | 991 |
1032 /* Work out how many bits we need to represent the limit. */ | 992 /* Work out how many bits we need to represent the limit. */ |
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED); | 993 min_ni_width = wi::min_precision (max_ni, UNSIGNED); |
1034 | 994 |
1035 /* Find a scalar mode for which WHILE_ULT is supported. */ | 995 /* Find a scalar mode for which WHILE_ULT is supported. */ |
1036 opt_scalar_int_mode cmp_mode_iter; | 996 opt_scalar_int_mode cmp_mode_iter; |
1037 tree cmp_type = NULL_TREE; | 997 tree cmp_type = NULL_TREE; |
998 tree iv_type = NULL_TREE; | |
999 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo); | |
1000 unsigned int iv_precision = UINT_MAX; | |
1001 | |
1002 if (iv_limit != -1) | |
1003 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter, | |
1004 UNSIGNED); | |
1005 | |
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) | 1006 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) |
1039 { | 1007 { |
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); | 1008 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); |
1041 if (cmp_bits >= min_ni_width | 1009 if (cmp_bits >= min_ni_width |
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) | 1010 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) |
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true); | 1012 tree this_type = build_nonstandard_integer_type (cmp_bits, true); |
1045 if (this_type | 1013 if (this_type |
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type)) | 1014 && can_produce_all_loop_masks_p (loop_vinfo, this_type)) |
1047 { | 1015 { |
1048 /* Although we could stop as soon as we find a valid mode, | 1016 /* Although we could stop as soon as we find a valid mode, |
1049 it's often better to continue until we hit Pmode, since the | 1017 there are at least two reasons why that's not always the |
1050 operands to the WHILE are more likely to be reusable in | 1018 best choice: |
1051 address calculations. */ | 1019 |
1052 cmp_type = this_type; | 1020 - An IV that's Pmode or wider is more likely to be reusable |
1021 in address calculations than an IV that's narrower than | |
1022 Pmode. | |
1023 | |
1024 - Doing the comparison in IV_PRECISION or wider allows | |
1025 a natural 0-based IV, whereas using a narrower comparison | |
1026 type requires mitigations against wrap-around. | |
1027 | |
1028 Conversely, if the IV limit is variable, doing the comparison | |
1029 in a wider type than the original type can introduce | |
1030 unnecessary extensions, so picking the widest valid mode | |
1031 is not always a good choice either. | |
1032 | |
1033 Here we prefer the first IV type that's Pmode or wider, | |
1034 and the first comparison type that's IV_PRECISION or wider. | |
1035 (The comparison type must be no wider than the IV type, | |
1036 to avoid extensions in the vector loop.) | |
1037 | |
1038 ??? We might want to try continuing beyond Pmode for ILP32 | |
1039 targets if CMP_BITS < IV_PRECISION. */ | |
1040 iv_type = this_type; | |
1041 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type)) | |
1042 cmp_type = this_type; | |
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode)) | 1043 if (cmp_bits >= GET_MODE_BITSIZE (Pmode)) |
1054 break; | 1044 break; |
1055 } | 1045 } |
1056 } | 1046 } |
1057 } | 1047 } |
1058 | 1048 |
1059 if (!cmp_type) | 1049 if (!cmp_type) |
1060 return false; | 1050 return false; |
1061 | 1051 |
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type; | 1052 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type; |
1053 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type; | |
1063 return true; | 1054 return true; |
1064 } | 1055 } |
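As a mental model for the WHILE_ULT comparison being sized here (a scalar sketch under assumed semantics, not GCC code): lane K of an rgroup mask is active exactly while the scalar IV plus K is below the limit, which is why the comparison type must be able to represent max_ni without wrap-around:

```c
#include <stdint.h>

/* One lane of a WHILE_ULT mask: lane K is active iff BASE + K < LIMIT.
   The comparison type chosen above must hold LIMIT (max_ni).  */
static inline int
while_ult_lane (uint64_t base, uint64_t limit, unsigned int k)
{
  return base + k < limit;
}
```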
1065 | 1056 |
1066 /* Calculate the cost of one scalar iteration of the loop. */ | 1057 /* Calculate the cost of one scalar iteration of the loop. */ |
1067 static void | 1058 static void |
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) | 1059 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) |
1069 { | 1060 { |
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 1061 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | 1062 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
1072 int nbbs = loop->num_nodes, factor; | 1063 int nbbs = loop->num_nodes, factor; |
1073 int innerloop_iters, i; | 1064 int innerloop_iters, i; |
1074 | 1065 |
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost"); | 1066 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost"); |
1098 | 1089 |
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt)) | 1090 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt)) |
1100 continue; | 1091 continue; |
1101 | 1092 |
1102 /* Skip stmts that are not vectorized inside the loop. */ | 1093 /* Skip stmts that are not vectorized inside the loop. */ |
1103 if (stmt_info | 1094 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info); |
1104 && !STMT_VINFO_RELEVANT_P (stmt_info) | 1095 if (!STMT_VINFO_RELEVANT_P (vstmt_info) |
1105 && (!STMT_VINFO_LIVE_P (stmt_info) | 1096 && (!STMT_VINFO_LIVE_P (vstmt_info) |
1106 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) | 1097 || !VECTORIZABLE_CYCLE_DEF |
1107 && !STMT_VINFO_IN_PATTERN_P (stmt_info)) | 1098 (STMT_VINFO_DEF_TYPE (vstmt_info)))) |
1108 continue; | 1099 continue; |
1109 | 1100 |
1110 vect_cost_for_stmt kind; | 1101 vect_cost_for_stmt kind; |
1111 if (STMT_VINFO_DATA_REF (stmt_info)) | 1102 if (STMT_VINFO_DATA_REF (stmt_info)) |
1112 { | 1103 { |
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) | 1104 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
1114 kind = scalar_load; | 1105 kind = scalar_load; |
1115 else | 1106 else |
1116 kind = scalar_store; | 1107 kind = scalar_store; |
1117 } | 1108 } |
1118 else | 1109 else if (vect_nop_conversion_p (stmt_info)) |
1110 continue; | |
1111 else | |
1119 kind = scalar_stmt; | 1112 kind = scalar_stmt; |
1120 | 1113 |
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), | 1114 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), |
1122 factor, kind, stmt_info, 0, vect_prologue); | 1115 factor, kind, stmt_info, 0, vect_prologue); |
1123 } | 1116 } |
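The new vect_nop_conversion_p test skips statements that convert between types without generating code, so they no longer inflate the scalar iteration cost. A likely example, on the assumption that width-preserving sign changes qualify as such no-op conversions:

```c
/* A width-preserving sign change normally emits no instruction, so
   costing it as a scalar_stmt would overstate the scalar loop cost.  */
static inline unsigned int
as_unsigned (int x)
{
  return (unsigned int) x;
}
```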
1147 - the loop exit condition is simple enough | 1140 - the loop exit condition is simple enough |
1148 - the number of iterations can be analyzed, i.e., a countable loop. The | 1141 - the number of iterations can be analyzed, i.e., a countable loop. The |
1149 niter could be analyzed under some assumptions. */ | 1142 niter could be analyzed under some assumptions. */ |
1150 | 1143 |
1151 opt_result | 1144 opt_result |
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond, | 1145 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond, |
1153 tree *assumptions, tree *number_of_iterationsm1, | 1146 tree *assumptions, tree *number_of_iterationsm1, |
1154 tree *number_of_iterations, gcond **inner_loop_cond) | 1147 tree *number_of_iterations, gcond **inner_loop_cond) |
1155 { | 1148 { |
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form"); | 1149 DUMP_VECT_SCOPE ("vect_analyze_loop_form"); |
1157 | 1150 |
1182 return opt_result::failure_at (vect_location, | 1175 return opt_result::failure_at (vect_location, |
1183 "not vectorized: empty loop.\n"); | 1176 "not vectorized: empty loop.\n"); |
1184 } | 1177 } |
1185 else | 1178 else |
1186 { | 1179 { |
1187 struct loop *innerloop = loop->inner; | 1180 class loop *innerloop = loop->inner; |
1188 edge entryedge; | 1181 edge entryedge; |
1189 | 1182 |
1190 /* Nested loop. We currently require that the loop is doubly-nested, | 1183 /* Nested loop. We currently require that the loop is doubly-nested, |
1191 contains a single inner loop, and the number of BBs is exactly 5. | 1184 contains a single inner loop, and the number of BBs is exactly 5. |
1192 Vectorizable outer-loops look like this: | 1185 Vectorizable outer-loops look like this: |
1299 } | 1292 } |
1300 | 1293 |
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */ | 1294 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */ |
1302 | 1295 |
1303 opt_loop_vec_info | 1296 opt_loop_vec_info |
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared) | 1297 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared) |
1305 { | 1298 { |
1306 tree assumptions, number_of_iterations, number_of_iterationsm1; | 1299 tree assumptions, number_of_iterations, number_of_iterationsm1; |
1307 gcond *loop_cond, *inner_loop_cond = NULL; | 1300 gcond *loop_cond, *inner_loop_cond = NULL; |
1308 | 1301 |
1309 opt_result res | 1302 opt_result res |
1362 statements update the vectorization factor. */ | 1355 statements update the vectorization factor. */ |
1363 | 1356 |
1364 static void | 1357 static void |
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo) | 1358 vect_update_vf_for_slp (loop_vec_info loop_vinfo) |
1366 { | 1359 { |
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 1360 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | 1361 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
1369 int nbbs = loop->num_nodes; | 1362 int nbbs = loop->num_nodes; |
1370 poly_uint64 vectorization_factor; | 1363 poly_uint64 vectorization_factor; |
1371 int i; | 1364 int i; |
1372 | 1365 |
1382 exploited. */ | 1375 exploited. */ |
1383 bool only_slp_in_loop = true; | 1376 bool only_slp_in_loop = true; |
1384 for (i = 0; i < nbbs; i++) | 1377 for (i = 0; i < nbbs; i++) |
1385 { | 1378 { |
1386 basic_block bb = bbs[i]; | 1379 basic_block bb = bbs[i]; |
1380 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); | |
1381 gsi_next (&si)) | |
1382 { | |
1383 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ()); | |
1384 if (!stmt_info) | |
1385 continue; | |
1386 if ((STMT_VINFO_RELEVANT_P (stmt_info) | |
1387 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) | |
1388 && !PURE_SLP_STMT (stmt_info)) | |
1389 /* STMT needs both SLP and loop-based vectorization. */ | |
1390 only_slp_in_loop = false; | |
1391 } | |
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); | 1392 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); |
1388 gsi_next (&si)) | 1393 gsi_next (&si)) |
1389 { | 1394 { |
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); | 1395 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); |
1391 stmt_info = vect_stmt_to_vectorize (stmt_info); | 1396 stmt_info = vect_stmt_to_vectorize (stmt_info); |
1397 } | 1402 } |
1398 } | 1403 } |
1399 | 1404 |
1400 if (only_slp_in_loop) | 1405 if (only_slp_in_loop) |
1401 { | 1406 { |
1402 dump_printf_loc (MSG_NOTE, vect_location, | 1407 if (dump_enabled_p ()) |
1403 "Loop contains only SLP stmts\n"); | 1408 dump_printf_loc (MSG_NOTE, vect_location, |
1409 "Loop contains only SLP stmts\n"); | |
1404 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); | 1410 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); |
1405 } | 1411 } |
1406 else | 1412 else |
1407 { | 1413 { |
1408 dump_printf_loc (MSG_NOTE, vect_location, | 1414 if (dump_enabled_p ()) |
1409 "Loop contains SLP and non-SLP stmts\n"); | 1415 dump_printf_loc (MSG_NOTE, vect_location, |
1416 "Loop contains SLP and non-SLP stmts\n"); | |
1410 /* Both the vectorization factor and unroll factor have the form | 1417 /* Both the vectorization factor and unroll factor have the form |
1411 current_vector_size * X for some rational X, so they must have | 1418 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X, |
1412 a common multiple. */ | 1419 so they must have a common multiple. */ |
1413 vectorization_factor | 1420 vectorization_factor |
1414 = force_common_multiple (vectorization_factor, | 1421 = force_common_multiple (vectorization_factor, |
1415 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); | 1422 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); |
1416 } | 1423 } |
1417 | 1424 |
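Because both factors are multiples of the vector size, a common multiple always exists; for constant factors the force_common_multiple step amounts to a least common multiple. A hypothetical scalar sketch (the real code operates on poly_uint64):

```c
/* lcm of constant factors: e.g. VF 4 and SLP unrolling factor 6
   combine to a final vectorization factor of 12.  */
static unsigned int
common_multiple (unsigned int vf, unsigned int uf)
{
  unsigned int a = vf, b = uf;
  while (b != 0)
    {
      unsigned int t = a % b;
      a = b;
      b = t;
    }
  return vf / a * uf;  /* a is now gcd (vf, uf).  */
}
```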
1456 Scan the loop stmts and make sure they are all vectorizable. */ | 1463 Scan the loop stmts and make sure they are all vectorizable. */ |
1457 | 1464 |
1458 static opt_result | 1465 static opt_result |
1459 vect_analyze_loop_operations (loop_vec_info loop_vinfo) | 1466 vect_analyze_loop_operations (loop_vec_info loop_vinfo) |
1460 { | 1467 { |
1461 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 1468 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1462 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | 1469 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
1463 int nbbs = loop->num_nodes; | 1470 int nbbs = loop->num_nodes; |
1464 int i; | 1471 int i; |
1465 stmt_vec_info stmt_info; | 1472 stmt_vec_info stmt_info; |
1466 bool need_to_vectorize = false; | 1473 bool need_to_vectorize = false; |
1467 bool ok; | 1474 bool ok; |
1468 | 1475 |
1469 DUMP_VECT_SCOPE ("vect_analyze_loop_operations"); | 1476 DUMP_VECT_SCOPE ("vect_analyze_loop_operations"); |
1470 | 1477 |
1471 stmt_vector_for_cost cost_vec; | 1478 auto_vec<stmt_info_for_cost> cost_vec; |
1472 cost_vec.create (2); | |
1473 | 1479 |
1474 for (i = 0; i < nbbs; i++) | 1480 for (i = 0; i < nbbs; i++) |
1475 { | 1481 { |
1476 basic_block bb = bbs[i]; | 1482 basic_block bb = bbs[i]; |
1477 | 1483 |
1511 return opt_result::failure_at (phi, "unsupported phi"); | 1517 return opt_result::failure_at (phi, "unsupported phi"); |
1512 | 1518 |
1513 phi_op = PHI_ARG_DEF (phi, 0); | 1519 phi_op = PHI_ARG_DEF (phi, 0); |
1514 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op); | 1520 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op); |
1515 if (!op_def_info) | 1521 if (!op_def_info) |
1516 return opt_result::failure_at (phi, "unsupported phi"); | 1522 return opt_result::failure_at (phi, "unsupported phi\n"); |
1517 | 1523 |
1518 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer | 1524 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer |
1519 && (STMT_VINFO_RELEVANT (op_def_info) | 1525 && (STMT_VINFO_RELEVANT (op_def_info) |
1520 != vect_used_in_outer_by_reduction)) | 1526 != vect_used_in_outer_by_reduction)) |
1521 return opt_result::failure_at (phi, "unsupported phi"); | 1527 return opt_result::failure_at (phi, "unsupported phi\n"); |
1528 | |
1529 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def | |
1530 || (STMT_VINFO_DEF_TYPE (stmt_info) | |
1531 == vect_double_reduction_def)) | |
1532 && !vectorizable_lc_phi (stmt_info, NULL, NULL)) | |
1533 return opt_result::failure_at (phi, "unsupported phi\n"); | |
1522 } | 1534 } |
1523 | 1535 |
1524 continue; | 1536 continue; |
1525 } | 1537 } |
1526 | 1538 |
1540 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def | 1552 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def |
1541 && ! PURE_SLP_STMT (stmt_info)) | 1553 && ! PURE_SLP_STMT (stmt_info)) |
1542 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL, | 1554 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL, |
1543 &cost_vec); | 1555 &cost_vec); |
1544 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def | 1556 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
1557 || (STMT_VINFO_DEF_TYPE (stmt_info) | |
1558 == vect_double_reduction_def) | |
1545 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) | 1559 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) |
1546 && ! PURE_SLP_STMT (stmt_info)) | 1560 && ! PURE_SLP_STMT (stmt_info)) |
1547 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL, | 1561 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec); |
1548 &cost_vec); | |
1549 } | 1562 } |
1550 | 1563 |
1551 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */ | 1564 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */ |
1552 if (ok | 1565 if (ok |
1553 && STMT_VINFO_LIVE_P (stmt_info) | 1566 && STMT_VINFO_LIVE_P (stmt_info) |
1554 && !PURE_SLP_STMT (stmt_info)) | 1567 && !PURE_SLP_STMT (stmt_info)) |
1555 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL, | 1568 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL, |
1556 &cost_vec); | 1569 -1, false, &cost_vec); |
1557 | 1570 |
1558 if (!ok) | 1571 if (!ok) |
1559 return opt_result::failure_at (phi, | 1572 return opt_result::failure_at (phi, |
1560 "not vectorized: relevant phi not " | 1573 "not vectorized: relevant phi not " |
1561 "supported: %G", | 1574 "supported: %G", |
1577 } | 1590 } |
1578 } | 1591 } |
1579 } /* bbs */ | 1592 } /* bbs */ |
1580 | 1593 |
1581 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec); | 1594 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec); |
1582 cost_vec.release (); | |
1583 | 1595 |
1584 /* All operations in the loop are either irrelevant (deal with loop | 1596 /* All operations in the loop are either irrelevant (deal with loop |
1585 control, or dead), or only used outside the loop and can be moved | 1597 control, or dead), or only used outside the loop and can be moved |
1586 out of the loop (e.g. invariants, inductions). The loop can be | 1598 out of the loop (e.g. invariants, inductions). The loop can be |
1587 optimized away by scalar optimizations. We're better off not | 1599 optimized away by scalar optimizations. We're better off not |
1604 definitely no, or -1 if it's worth retrying. */ | 1616 definitely no, or -1 if it's worth retrying. */ |
1605 | 1617 |
1606 static int | 1618 static int |
1607 vect_analyze_loop_costing (loop_vec_info loop_vinfo) | 1619 vect_analyze_loop_costing (loop_vec_info loop_vinfo) |
1608 { | 1620 { |
1609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 1621 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1610 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); | 1622 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); |
1611 | 1623 |
1612 /* Only fully-masked loops can have iteration counts less than the | 1624 /* Only fully-masked loops can have iteration counts less than the |
1613 vectorization factor. */ | 1625 vectorization factor. */ |
1614 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) | 1626 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) |
1645 "not vectorized: vector version will never be " | 1657 "not vectorized: vector version will never be " |
1646 "profitable.\n"); | 1658 "profitable.\n"); |
1647 return -1; | 1659 return -1; |
1648 } | 1660 } |
1649 | 1661 |
1650 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) | 1662 int min_scalar_loop_bound = (param_min_vect_loop_bound |
1651 * assumed_vf); | 1663 * assumed_vf); |
1652 | 1664 |
1653 /* Use the cost model only if it is more conservative than user specified | 1665 /* Use the cost model only if it is more conservative than user specified |
1654 threshold. */ | 1666 threshold. */ |
1655 unsigned int th = (unsigned) MAX (min_scalar_loop_bound, | 1667 unsigned int th = (unsigned) MAX (min_scalar_loop_bound, |
1669 "specified loop bound parameter or minimum profitable " | 1681 "specified loop bound parameter or minimum profitable " |
1670 "iterations (whichever is more conservative).\n"); | 1682 "iterations (whichever is more conservative).\n"); |
1671 return 0; | 1683 return 0; |
1672 } | 1684 } |
1673 | 1685 |
1674 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop); | 1686 /* The static profitability threshold min_profitable_estimate includes |
1675 if (estimated_niter == -1) | 1687 the cost of having to check at runtime whether the scalar loop |
1676 estimated_niter = likely_max_stmt_executions_int (loop); | 1688 should be used instead. If it turns out that we don't need or want |
1689 such a check, the threshold we should use for the static estimate | |
1690 is simply the point at which the vector loop becomes more profitable | |
1691 than the scalar loop. */ | |
1692 if (min_profitable_estimate > min_profitable_iters | |
1693 && !LOOP_REQUIRES_VERSIONING (loop_vinfo) | |
1694 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) | |
1695 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) | |
1696 && !vect_apply_runtime_profitability_check_p (loop_vinfo)) | |
1697 { | |
1698 if (dump_enabled_p ()) | |
1699 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime" | |
1700 " choice between the scalar and vector loops\n"); | |
1701 min_profitable_estimate = min_profitable_iters; | |
1702 } | |
1703 | |
1704 HOST_WIDE_INT estimated_niter; | |
1705 | |
1706 /* If we are vectorizing an epilogue then we know the maximum number of | |
1707 scalar iterations it will cover is at least one lower than the | |
1708 vectorization factor of the main loop. */ | |
1709 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) | |
1710 estimated_niter | |
1711 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1; | |
1712 else | |
1713 { | |
1714 estimated_niter = estimated_stmt_executions_int (loop); | |
1715 if (estimated_niter == -1) | |
1716 estimated_niter = likely_max_stmt_executions_int (loop); | |
1717 } | |
1677 if (estimated_niter != -1 | 1718 if (estimated_niter != -1 |
1678 && ((unsigned HOST_WIDE_INT) estimated_niter | 1719 && ((unsigned HOST_WIDE_INT) estimated_niter |
1679 < MAX (th, (unsigned) min_profitable_estimate))) | 1720 < MAX (th, (unsigned) min_profitable_estimate))) |
1680 { | 1721 { |
1681 if (dump_enabled_p ()) | 1722 if (dump_enabled_p ()) |
1744 return res; | 1785 return res; |
1745 } | 1786 } |
1746 /* If dependence analysis will give up due to the limit on the | 1787 /* If dependence analysis will give up due to the limit on the |
1747 number of datarefs stop here and fail fatally. */ | 1788 number of datarefs stop here and fail fatally. */ |
1748 if (datarefs->length () | 1789 if (datarefs->length () |
1749 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS)) | 1790 > (unsigned)param_loop_max_datarefs_for_datadeps) |
1750 return opt_result::failure_at (stmt, "exceeded param " | 1791 return opt_result::failure_at (stmt, "exceeded param " |
1751 "loop-max-datarefs-for-datadeps\n"); | 1792 "loop-max-datarefs-for-datadeps\n"); |
1752 } | 1793 } |
1753 return opt_result::success (); | 1794 return opt_result::success (); |
1754 } | 1795 } |
1755 | 1796 |
1756 /* Function vect_analyze_loop_2. | 1797 /* Look for SLP-only access groups and turn each individual access into its own |
1757 | 1798 group. */ |
1758 Apply a set of analyses on LOOP, and create a loop_vec_info struct | 1799 static void |
1759 for it. The different analyses will record information in the | 1800 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo) |
1760 loop_vec_info struct. */ | |
1761 static opt_result | |
1762 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) | |
1763 { | 1801 { |
1764 opt_result ok = opt_result::success (); | 1802 unsigned int i; |
1765 int res; | 1803 struct data_reference *dr; |
1766 unsigned int max_vf = MAX_VECTORIZATION_FACTOR; | 1804 |
1767 poly_uint64 min_vf = 2; | 1805 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups"); |
1768 | 1806 |
1769 /* The first group of checks is independent of the vector size. */ | 1807 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs; |
1770 fatal = true; | 1808 FOR_EACH_VEC_ELT (datarefs, i, dr) |
1771 | 1809 { |
1772 /* Find all data references in the loop (which correspond to vdefs/vuses) | 1810 gcc_assert (DR_REF (dr)); |
1773 and analyze their evolution in the loop. */ | 1811 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr)); |
1774 | 1812 |
1775 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo); | 1813 /* Check if the load is a part of an interleaving chain. */ |
1776 | 1814 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
1777 /* Gather the data references and count stmts in the loop. */ | 1815 { |
1778 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ()) | 1816 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info); |
1779 { | 1817 unsigned int group_size = DR_GROUP_SIZE (first_element); |
1780 opt_result res | 1818 |
1781 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo), | 1819 /* Check if SLP-only groups. */ |
1782 &LOOP_VINFO_DATAREFS (loop_vinfo), | 1820 if (!STMT_SLP_TYPE (stmt_info) |
1783 n_stmts); | 1821 && STMT_VINFO_SLP_VECT_ONLY (first_element)) |
1784 if (!res) | 1822 { |
1785 { | 1823 /* Dissolve the group. */ |
1786 if (dump_enabled_p ()) | 1824 STMT_VINFO_SLP_VECT_ONLY (first_element) = false; |
1787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | 1825 |
1788 "not vectorized: loop contains function " | 1826 stmt_vec_info vinfo = first_element; |
1789 "calls or data references that cannot " | 1827 while (vinfo) |
1790 "be analyzed\n"); | 1828 { |
1791 return res; | 1829 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo); |
1792 } | 1830 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo; |
1793 loop_vinfo->shared->save_datarefs (); | 1831 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL; |
1794 } | 1832 DR_GROUP_SIZE (vinfo) = 1; |
1795 else | 1833 if (STMT_VINFO_STRIDED_P (first_element)) |
1796 loop_vinfo->shared->check_datarefs (); | 1834 DR_GROUP_GAP (vinfo) = 0; |
1797 | 1835 else |
1798 /* Analyze the data references and also adjust the minimal | 1836 DR_GROUP_GAP (vinfo) = group_size - 1; |
1799 vectorization factor according to the loads and stores. */ | 1837 vinfo = next; |
1800 | 1838 } |
1801 ok = vect_analyze_data_refs (loop_vinfo, &min_vf); | 1839 } |
1802 if (!ok) | 1840 } |
1803 { | 1841 } |
1804 if (dump_enabled_p ()) | 1842 } |
1805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | 1843 |
1806 "bad data references.\n"); | 1844 |
1807 return ok; | 1845 /* Decides whether we need to create an epilogue loop to handle |
1808 } | 1846 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */ |
1809 | 1847 |
1810 /* Classify all cross-iteration scalar data-flow cycles. | 1848 void |
1811 Cross-iteration cycles caused by virtual phis are analyzed separately. */ | 1849 determine_peel_for_niter (loop_vec_info loop_vinfo) |
1812 vect_analyze_scalar_cycles (loop_vinfo); | 1850 { |
1813 | 1851 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; |
1814 vect_pattern_recog (loop_vinfo); | 1852 |
1815 | 1853 unsigned HOST_WIDE_INT const_vf; |
1816 vect_fixup_scalar_cycles_with_patterns (loop_vinfo); | |
1817 | |
1818 /* Analyze the access patterns of the data-refs in the loop (consecutive, | |
1819 complex, etc.). FORNOW: Only handle consecutive access pattern. */ | |
1820 | |
1821 ok = vect_analyze_data_ref_accesses (loop_vinfo); | |
1822 if (!ok) | |
1823 { | |
1824 if (dump_enabled_p ()) | |
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
1826 "bad data access.\n"); | |
1827 return ok; | |
1828 } | |
1829 | |
1830 /* Data-flow analysis to detect stmts that do not need to be vectorized. */ | |
1831 | |
1832 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); | |
1833 if (!ok) | |
1834 { | |
1835 if (dump_enabled_p ()) | |
1836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
1837 "unexpected pattern.\n"); | |
1838 return ok; | |
1839 } | |
1840 | |
1841 /* While the rest of the analysis below depends on it in some way. */ | |
1842 fatal = false; | |
1843 | |
1844 /* Analyze data dependences between the data-refs in the loop | |
1845 and adjust the maximum vectorization factor according to | |
1846 the dependences. | |
1847 FORNOW: fail at the first data dependence that we encounter. */ | |
1848 | |
1849 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf); | |
1850 if (!ok) | |
1851 { | |
1852 if (dump_enabled_p ()) | |
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
1854 "bad data dependence.\n"); | |
1855 return ok; | |
1856 } | |
1857 if (max_vf != MAX_VECTORIZATION_FACTOR | |
1858 && maybe_lt (max_vf, min_vf)) | |
1859 return opt_result::failure_at (vect_location, "bad data dependence.\n"); | |
1860 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf; | |
1861 | |
1862 ok = vect_determine_vectorization_factor (loop_vinfo); | |
1863 if (!ok) | |
1864 { | |
1865 if (dump_enabled_p ()) | |
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
1867 "can't determine vectorization factor.\n"); | |
1868 return ok; | |
1869 } | |
1870 if (max_vf != MAX_VECTORIZATION_FACTOR | |
1871 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo))) | |
1872 return opt_result::failure_at (vect_location, "bad data dependence.\n"); | |
1873 | |
1874 /* Compute the scalar iteration cost. */ | |
1875 vect_compute_single_scalar_iteration_cost (loop_vinfo); | |
1876 | |
1877 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
1878 unsigned th; | |
1879 | |
1880 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ | |
1881 ok = vect_analyze_slp (loop_vinfo, *n_stmts); | |
1882 if (!ok) | |
1883 return ok; | |
1884 | |
1885 /* If there are any SLP instances mark them as pure_slp. */ | |
1886 bool slp = vect_make_slp_decision (loop_vinfo); | |
1887 if (slp) | |
1888 { | |
1889 /* Find stmts that need to be both vectorized and SLPed. */ | |
1890 vect_detect_hybrid_slp (loop_vinfo); | |
1891 | |
1892 /* Update the vectorization factor based on the SLP decision. */ | |
1893 vect_update_vf_for_slp (loop_vinfo); | |
1894 } | |
1895 | |
1896 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo); | |
1897 | |
1898 /* We don't expect to have to roll back to anything other than an empty | |
1899 set of rgroups. */ | |
1900 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()); | |
1901 | |
1902 /* This is the point where we can re-start analysis with SLP forced off. */ | |
1903 start_over: | |
1904 | |
1905 /* Now the vectorization factor is final. */ | |
1906 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
1907 gcc_assert (known_ne (vectorization_factor, 0U)); | |
1908 | |
1909 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ()) | |
1910 { | |
1911 dump_printf_loc (MSG_NOTE, vect_location, | |
1912 "vectorization_factor = "); | |
1913 dump_dec (MSG_NOTE, vectorization_factor); | |
1914 dump_printf (MSG_NOTE, ", niters = %wd\n", | |
1915 LOOP_VINFO_INT_NITERS (loop_vinfo)); | |
1916 } | |
1917 | |
1918 HOST_WIDE_INT max_niter | 1854 HOST_WIDE_INT max_niter |
1919 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); | 1855 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); |
1920 | 1856 |
1921 /* Analyze the alignment of the data-refs in the loop. | 1857 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); |
1922 Fail if a data reference is found that cannot be vectorized. */ | 1858 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) |
1923 | 1859 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO |
1924 ok = vect_analyze_data_refs_alignment (loop_vinfo); | 1860 (loop_vinfo)); |
1925 if (!ok) | 1861 |
1926 { | |
1927 if (dump_enabled_p ()) | |
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
1929 "bad data alignment.\n"); | |
1930 return ok; | |
1931 } | |
1932 | |
1933 /* Prune the list of ddrs to be tested at run-time by versioning for alias. | |
1934 It is important to call pruning after vect_analyze_data_ref_accesses, | |
1935 since we use grouping information gathered by interleaving analysis. */ | |
1936 ok = vect_prune_runtime_alias_test_list (loop_vinfo); | |
1937 if (!ok) | |
1938 return ok; | |
1939 | |
1940 /* Do not invoke vect_enhance_data_refs_alignment for epilogue | |
1941 vectorization, since we do not want to add extra peeling or | |
1942 add versioning for alignment. */ | |
1943 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) | |
1944 /* This pass will decide on using loop versioning and/or loop peeling in | |
1945 order to enhance the alignment of data references in the loop. */ | |
1946 ok = vect_enhance_data_refs_alignment (loop_vinfo); | |
1947 else | |
1948 ok = vect_verify_datarefs_alignment (loop_vinfo); | |
1949 if (!ok) | |
1950 return ok; | |
1951 | |
1952 if (slp) | |
1953 { | |
1954 /* Analyze operations in the SLP instances. Note this may | |
1955 remove unsupported SLP instances which makes the above | |
1956 SLP kind detection invalid. */ | |
1957 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length (); | |
1958 vect_slp_analyze_operations (loop_vinfo); | |
1959 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size) | |
1960 { | |
1961 ok = opt_result::failure_at (vect_location, | |
1962 "unsupported SLP instances\n"); | |
1963 goto again; | |
1964 } | |
1965 } | |
1966 | |
1967 /* Scan all the remaining operations in the loop that are not subject | |
1968 to SLP and make sure they are vectorizable. */ | |
1969 ok = vect_analyze_loop_operations (loop_vinfo); | |
1970 if (!ok) | |
1971 { | |
1972 if (dump_enabled_p ()) | |
1973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
1974 "bad operation or unsupported loop bound.\n"); | |
1975 return ok; | |
1976 } | |
1977 | |
1978 /* Decide whether to use a fully-masked loop for this vectorization | |
1979 factor. */ | |
1980 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) | |
1981 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) | |
1982 && vect_verify_full_masking (loop_vinfo)); | |
1983 if (dump_enabled_p ()) | |
1984 { | |
1985 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) | |
1986 dump_printf_loc (MSG_NOTE, vect_location, | |
1987 "using a fully-masked loop.\n"); | |
1988 else | |
1989 dump_printf_loc (MSG_NOTE, vect_location, | |
1990 "not using a fully-masked loop.\n"); | |
1991 } | |
1992 | |
1993 /* If an epilogue loop is required because of data accesses with gaps, |
1994 one additional iteration needs to be peeled. Check that there are |
1995 enough iterations for vectorization. */ |
1996 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) | |
1997 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
1998 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) | |
1999 { | |
2000 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
2001 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo); | |
2002 | |
2003 if (known_lt (wi::to_widest (scalar_niters), vf)) | |
2004 return opt_result::failure_at (vect_location, | |
2005 "loop has no enough iterations to" | |
2006 " support peeling for gaps.\n"); | |
2007 } | |
2008 | |
2009 /* Check that the costings of the loop make vectorizing worthwhile. */ |
2010 res = vect_analyze_loop_costing (loop_vinfo); | |
2011 if (res < 0) | |
2012 { | |
2013 ok = opt_result::failure_at (vect_location, | |
2014 "Loop costings may not be worthwhile.\n"); | |
2015 goto again; | |
2016 } | |
2017 if (!res) | |
2018 return opt_result::failure_at (vect_location, | |
2019 "Loop costings not worthwhile.\n"); | |
2020 | |
2021 /* Decide whether we need to create an epilogue loop to handle | |
2022 remaining scalar iterations. */ | |
2023 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); | |
2024 | |
2025 unsigned HOST_WIDE_INT const_vf; | |
2026 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) | 1862 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) |
2027 /* The main loop handles all iterations. */ | 1863 /* The main loop handles all iterations. */ |
2028 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; | 1864 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; |
2029 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | 1865 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
2030 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) | 1866 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) |
2051 the epilogue is unnecessary. */ | 1887 the epilogue is unnecessary. */ |
2052 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) | 1888 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) |
2053 || ((unsigned HOST_WIDE_INT) max_niter | 1889 || ((unsigned HOST_WIDE_INT) max_niter |
2054 > (th / const_vf) * const_vf)))) | 1890 > (th / const_vf) * const_vf)))) |
2055 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; | 1891 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; |
2056 | 1892 } |
1893 | |
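/* Illustration (not part of this file): the situation determine_peel_for_niter
   reasons about, as a minimal sketch.  Assuming VF = 4 and n = 10 (hypothetical
   numbers), the hand-unrolled loop below stands in for the vector loop and
   covers 8 elements; the remaining 2 iterations need the scalar epilogue,
   which is what PEELING_FOR_NITER records.  A fully-masked loop would instead
   execute the tail under a mask and need no epilogue.  */
void
add_arrays (int *restrict a, const int *restrict b, int n)
{
  int i;
  /* Stand-in for the main vector loop: 4 elements per iteration.  */
  for (i = 0; i + 4 <= n; i += 4)
    {
      a[i] += b[i];
      a[i + 1] += b[i + 1];
      a[i + 2] += b[i + 2];
      a[i + 3] += b[i + 3];
    }
  /* The peeled scalar epilogue for the remaining n % 4 iterations.  */
  for (; i < n; i++)
    a[i] += b[i];
}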
1894 | |
1895 /* Function vect_analyze_loop_2. | |
1896 | |
1897 Apply a set of analyses on LOOP, and create a loop_vec_info struct | |
1898 for it. The different analyses will record information in the | |
1899 loop_vec_info struct. */ | |
1900 static opt_result | |
1901 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) | |
1902 { | |
1903 opt_result ok = opt_result::success (); | |
1904 int res; | |
1905 unsigned int max_vf = MAX_VECTORIZATION_FACTOR; | |
1906 poly_uint64 min_vf = 2; | |
1907 loop_vec_info orig_loop_vinfo = NULL; | |
1908 | |
1909 /* If we are dealing with an epilogue then orig_loop_vinfo points to the | |
1910 loop_vec_info of the first vectorized loop. */ | |
1911 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) | |
1912 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); | |
1913 else | |
1914 orig_loop_vinfo = loop_vinfo; | |
1915 gcc_assert (orig_loop_vinfo); | |
1916 | |
1917 /* The first group of checks is independent of the vector size. */ | |
1918 fatal = true; | |
1919 | |
1920 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo) | |
1921 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo))) | |
1922 return opt_result::failure_at (vect_location, | |
1923 "not vectorized: simd if(0)\n"); | |
1924 | |
1925 /* Find all data references in the loop (which correspond to vdefs/vuses) | |
1926 and analyze their evolution in the loop. */ | |
1927 | |
1928 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo); | |
1929 | |
1930 /* Gather the data references and count stmts in the loop. */ | |
1931 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ()) | |
1932 { | |
1933 opt_result res | |
1934 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo), | |
1935 &LOOP_VINFO_DATAREFS (loop_vinfo), | |
1936 n_stmts); | |
1937 if (!res) | |
1938 { | |
1939 if (dump_enabled_p ()) | |
1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
1941 "not vectorized: loop contains function " | |
1942 "calls or data references that cannot " | |
1943 "be analyzed\n"); | |
1944 return res; | |
1945 } | |
1946 loop_vinfo->shared->save_datarefs (); | |
1947 } | |
1948 else | |
1949 loop_vinfo->shared->check_datarefs (); | |
1950 | |
1951 /* Analyze the data references and also adjust the minimal | |
1952 vectorization factor according to the loads and stores. */ | |
1953 | |
1954 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal); | |
1955 if (!ok) | |
1956 { | |
1957 if (dump_enabled_p ()) | |
1958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
1959 "bad data references.\n"); | |
1960 return ok; | |
1961 } | |
1962 | |
1963 /* Classify all cross-iteration scalar data-flow cycles. | |
1964 Cross-iteration cycles caused by virtual phis are analyzed separately. */ | |
1965 vect_analyze_scalar_cycles (loop_vinfo); | |
1966 | |
1967 vect_pattern_recog (loop_vinfo); | |
1968 | |
1969 vect_fixup_scalar_cycles_with_patterns (loop_vinfo); | |
1970 | |
1971 /* Analyze the access patterns of the data-refs in the loop (consecutive, | |
1972 complex, etc.). FORNOW: Only handle consecutive access pattern. */ | |
1973 | |
1974 ok = vect_analyze_data_ref_accesses (loop_vinfo); | |
1975 if (!ok) | |
1976 { | |
1977 if (dump_enabled_p ()) | |
1978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
1979 "bad data access.\n"); | |
1980 return ok; | |
1981 } | |
1982 | |
1983 /* Data-flow analysis to detect stmts that do not need to be vectorized. */ | |
1984 | |
1985 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal); | |
1986 if (!ok) | |
1987 { | |
1988 if (dump_enabled_p ()) | |
1989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
1990 "unexpected pattern.\n"); | |
1991 return ok; | |
1992 } | |
1993 | |
1994 /* The rest of the analysis below depends on it in some way. */ |
1995 fatal = false; | |
1996 | |
1997 /* Analyze data dependences between the data-refs in the loop | |
1998 and adjust the maximum vectorization factor according to | |
1999 the dependences. | |
2000 FORNOW: fail at the first data dependence that we encounter. */ | |
2001 | |
2002 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf); | |
2003 if (!ok) | |
2004 { | |
2005 if (dump_enabled_p ()) | |
2006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
2007 "bad data dependence.\n"); | |
2008 return ok; | |
2009 } | |
2010 if (max_vf != MAX_VECTORIZATION_FACTOR | |
2011 && maybe_lt (max_vf, min_vf)) | |
2012 return opt_result::failure_at (vect_location, "bad data dependence.\n"); | |
2013 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf; | |
2014 | |
2015 ok = vect_determine_vectorization_factor (loop_vinfo); | |
2016 if (!ok) | |
2017 { | |
2018 if (dump_enabled_p ()) | |
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
2020 "can't determine vectorization factor.\n"); | |
2021 return ok; | |
2022 } | |
2023 if (max_vf != MAX_VECTORIZATION_FACTOR | |
2024 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo))) | |
2025 return opt_result::failure_at (vect_location, "bad data dependence.\n"); | |
2026 | |
2027 /* Compute the scalar iteration cost. */ | |
2028 vect_compute_single_scalar_iteration_cost (loop_vinfo); | |
2029 | |
2030 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
2031 | |
2032 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ | |
2033 ok = vect_analyze_slp (loop_vinfo, *n_stmts); | |
2034 if (!ok) | |
2035 return ok; | |
2036 | |
2037 /* If there are any SLP instances mark them as pure_slp. */ | |
2038 bool slp = vect_make_slp_decision (loop_vinfo); | |
2039 if (slp) | |
2040 { | |
2041 /* Find stmts that need to be both vectorized and SLPed. */ | |
2042 vect_detect_hybrid_slp (loop_vinfo); | |
2043 | |
2044 /* Update the vectorization factor based on the SLP decision. */ | |
2045 vect_update_vf_for_slp (loop_vinfo); | |
2046 } | |
2047 | |
2048 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo); | |
2049 | |
2050 /* We don't expect to have to roll back to anything other than an empty | |
2051 set of rgroups. */ | |
2052 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()); | |
2053 | |
2054 /* This is the point where we can re-start analysis with SLP forced off. */ | |
2055 start_over: | |
2056 | |
2057 /* Now the vectorization factor is final. */ | |
2058 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
2059 gcc_assert (known_ne (vectorization_factor, 0U)); | |
2060 | |
2061 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ()) | |
2062 { | |
2063 dump_printf_loc (MSG_NOTE, vect_location, | |
2064 "vectorization_factor = "); | |
2065 dump_dec (MSG_NOTE, vectorization_factor); | |
2066 dump_printf (MSG_NOTE, ", niters = %wd\n", | |
2067 LOOP_VINFO_INT_NITERS (loop_vinfo)); | |
2068 } | |
2069 | |
2070 /* Analyze the alignment of the data-refs in the loop. | |
2071 Fail if a data reference is found that cannot be vectorized. */ | |
2072 | |
2073 ok = vect_analyze_data_refs_alignment (loop_vinfo); | |
2074 if (!ok) | |
2075 { | |
2076 if (dump_enabled_p ()) | |
2077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
2078 "bad data alignment.\n"); | |
2079 return ok; | |
2080 } | |
2081 | |
2082 /* Prune the list of ddrs to be tested at run-time by versioning for alias. | |
2083 It is important to call pruning after vect_analyze_data_ref_accesses, | |
2084 since we use grouping information gathered by interleaving analysis. */ | |
2085 ok = vect_prune_runtime_alias_test_list (loop_vinfo); | |
2086 if (!ok) | |
2087 return ok; | |
2088 | |
2089 /* Do not invoke vect_enhance_data_refs_alignment for epilogue | |
2090 vectorization, since we do not want to add extra peeling or | |
2091 add versioning for alignment. */ | |
2092 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) | |
2093 /* This pass will decide on using loop versioning and/or loop peeling in | |
2094 order to enhance the alignment of data references in the loop. */ | |
2095 ok = vect_enhance_data_refs_alignment (loop_vinfo); | |
2096 else | |
2097 ok = vect_verify_datarefs_alignment (loop_vinfo); | |
2098 if (!ok) | |
2099 return ok; | |
2100 | |
2101 if (slp) | |
2102 { | |
2103 /* Analyze operations in the SLP instances. Note this may | |
2104 remove unsupported SLP instances which makes the above | |
2105 SLP kind detection invalid. */ | |
2106 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length (); | |
2107 vect_slp_analyze_operations (loop_vinfo); | |
2108 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size) | |
2109 { | |
2110 ok = opt_result::failure_at (vect_location, | |
2111 "unsupported SLP instances\n"); | |
2112 goto again; | |
2113 } | |
2114 } | |
2115 | |
2116 /* Dissolve SLP-only groups. */ | |
2117 vect_dissolve_slp_only_groups (loop_vinfo); | |
2118 | |
2119 /* Scan all the remaining operations in the loop that are not subject | |
2120 to SLP and make sure they are vectorizable. */ | |
2121 ok = vect_analyze_loop_operations (loop_vinfo); | |
2122 if (!ok) | |
2123 { | |
2124 if (dump_enabled_p ()) | |
2125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
2126 "bad operation or unsupported loop bound.\n"); | |
2127 return ok; | |
2128 } | |
2129 | |
2130 /* Decide whether to use a fully-masked loop for this vectorization | |
2131 factor. */ | |
2132 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) | |
2133 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) | |
2134 && vect_verify_full_masking (loop_vinfo)); | |
2135 if (dump_enabled_p ()) | |
2136 { | |
2137 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) | |
2138 dump_printf_loc (MSG_NOTE, vect_location, | |
2139 "using a fully-masked loop.\n"); | |
2140 else | |
2141 dump_printf_loc (MSG_NOTE, vect_location, | |
2142 "not using a fully-masked loop.\n"); | |
2143 } | |
2144 | |
2145 /* If an epilogue loop is required because of data accesses with gaps, |
2146 one additional iteration needs to be peeled. Check that there are |
2147 enough iterations for vectorization. */ |
2148 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) | |
2149 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
2150 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) | |
2151 { | |
2152 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
2153 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo); | |
2154 | |
2155 if (known_lt (wi::to_widest (scalar_niters), vf)) | |
2156 return opt_result::failure_at (vect_location, | |
2157 "loop has no enough iterations to" | |
2158 " support peeling for gaps.\n"); | |
2159 } | |
2160 | |
2161 /* If we're vectorizing an epilogue loop, we either need a fully-masked | |
2162 loop or a loop that has a lower VF than the main loop. */ | |
2163 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) | |
2164 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) | |
2165 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), | |
2166 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo))) | |
2167 return opt_result::failure_at (vect_location, | |
2168 "Vectorization factor too high for" | |
2169 " epilogue loop.\n"); | |
2170 | |
2171 /* Check that the costings of the loop make vectorizing worthwhile. */ |
2172 res = vect_analyze_loop_costing (loop_vinfo); | |
2173 if (res < 0) | |
2174 { | |
2175 ok = opt_result::failure_at (vect_location, | |
2176 "Loop costings may not be worthwhile.\n"); | |
2177 goto again; | |
2178 } | |
2179 if (!res) | |
2180 return opt_result::failure_at (vect_location, | |
2181 "Loop costings not worthwhile.\n"); | |
2182 | |
2183 determine_peel_for_niter (loop_vinfo); | |
2057 /* If an epilogue loop is required make sure we can create one. */ | 2184 /* If an epilogue loop is required make sure we can create one. */ |
2058 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) | 2185 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) |
2059 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) | 2186 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) |
2060 { | 2187 { |
2061 if (dump_enabled_p ()) | 2188 if (dump_enabled_p ()) |
2073 } | 2200 } |
2074 | 2201 |
2075 /* During peeling, we need to check if number of loop iterations is | 2202 /* During peeling, we need to check if number of loop iterations is |
2076 enough for both peeled prolog loop and vector loop. This check | 2203 enough for both peeled prolog loop and vector loop. This check |
2077 can be merged along with threshold check of loop versioning, so | 2204 can be merged along with threshold check of loop versioning, so |
2078 increase threshold for this case if necessary. */ | 2205 increase threshold for this case if necessary. |
2079 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) | 2206 |
2207 If we are analyzing an epilogue we still want to check what its | |
2208 versioning threshold would be. If we decide to vectorize the epilogues we | |
2209 will want to use the lowest versioning threshold of all epilogues and main | |
2210 loop. This will enable us to enter a vectorized epilogue even when | |
2211 versioning the loop. We can't simply check whether the epilogue requires | |
2212 versioning though since we may have skipped some versioning checks when | |
2213 analyzing the epilogue. For instance, checks for alias versioning will be | |
2214 skipped when dealing with epilogues as we assume we already checked them | |
2215 for the main loop. So instead we always check the 'orig_loop_vinfo'. */ | |
2216 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)) | |
2080 { | 2217 { |
2081 poly_uint64 niters_th = 0; | 2218 poly_uint64 niters_th = 0; |
2219 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); | |
2082 | 2220 |
2083 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) | 2221 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
2084 { | 2222 { |
2085 /* Niters for peeled prolog loop. */ | 2223 /* Niters for peeled prolog loop. */ |
2086 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) | 2224 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) |
2097 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) | 2235 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) |
2098 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); | 2236 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
2099 /* One additional iteration because of peeling for gap. */ | 2237 /* One additional iteration because of peeling for gap. */ |
2100 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) | 2238 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) |
2101 niters_th += 1; | 2239 niters_th += 1; |
2240 | |
2241 /* Use the same condition as vect_transform_loop to decide when to use | |
2242 the cost to determine a versioning threshold. */ | |
2243 if (vect_apply_runtime_profitability_check_p (loop_vinfo) | |
2244 && ordered_p (th, niters_th)) | |
2245 niters_th = ordered_max (poly_uint64 (th), niters_th); | |
2246 | |
2102 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; | 2247 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; |
2103 } | 2248 } |
2104 | 2249 |
2105 gcc_assert (known_eq (vectorization_factor, | 2250 gcc_assert (known_eq (vectorization_factor, |
2106 LOOP_VINFO_VECT_FACTOR (loop_vinfo))); | 2251 LOOP_VINFO_VECT_FACTOR (loop_vinfo))); |
2174 for (gimple_stmt_iterator si = gsi_start_phis (bb); | 2319 for (gimple_stmt_iterator si = gsi_start_phis (bb); |
2175 !gsi_end_p (si); gsi_next (&si)) | 2320 !gsi_end_p (si); gsi_next (&si)) |
2176 { | 2321 { |
2177 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); | 2322 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); |
2178 STMT_SLP_TYPE (stmt_info) = loop_vect; | 2323 STMT_SLP_TYPE (stmt_info) = loop_vect; |
2324 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def | |
2325 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) | |
2326 { | |
2327 /* vectorizable_reduction adjusts reduction stmt def-types, | |
2328 restore them to that of the PHI. */ | |
2329 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info)) | |
2330 = STMT_VINFO_DEF_TYPE (stmt_info); | |
2331 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize | |
2332 (STMT_VINFO_REDUC_DEF (stmt_info))) | |
2333 = STMT_VINFO_DEF_TYPE (stmt_info); | |
2334 } | |
2179 } | 2335 } |
2180 for (gimple_stmt_iterator si = gsi_start_bb (bb); | 2336 for (gimple_stmt_iterator si = gsi_start_bb (bb); |
2181 !gsi_end_p (si); gsi_next (&si)) | 2337 !gsi_end_p (si); gsi_next (&si)) |
2182 { | 2338 { |
2183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); | 2339 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); |
2212 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p; | 2368 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p; |
2213 | 2369 |
2214 goto start_over; | 2370 goto start_over; |
2215 } | 2371 } |
2216 | 2372 |
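/* Illustration (not part of this file): a minimal, hypothetical sketch of the
   start_over/again control flow used by vect_analyze_loop_2 above.  All names
   here are stand-ins.  On failure of an SLP-sensitive check the function rolls
   the SLP-specific state back and re-runs the whole analysis with SLP forced
   off, rather than giving up outright.  */
#include <stdbool.h>

static bool checks_pass (bool slp) { return !slp; }  /* hypothetical stand-in */
static void undo_slp_state (void) {}                 /* hypothetical stand-in */

static bool
analyze_with_rollback (void)
{
  bool slp = true;
start_over:
  if (checks_pass (slp))
    return true;            /* analysis succeeded */
  if (!slp)
    return false;           /* nothing left to retry */
  /* The "again:" path: roll back SLP state, then restart without SLP.  */
  undo_slp_state ();
  slp = false;
  goto start_over;
}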
2373 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears | |
2374 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that | |
2375 OLD_LOOP_VINFO is better unless something specifically indicates | |
2376 otherwise. | |
2377 | |
2378 Note that this deliberately isn't a partial order. */ | |
2379 | |
2380 static bool | |
2381 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, | |
2382 loop_vec_info old_loop_vinfo) | |
2383 { | |
2384 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo); | |
2385 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop); | |
2386 | |
2387 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo); | |
2388 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo); | |
2389 | |
2390 /* Always prefer a VF of loop->simdlen over any other VF. */ | |
2391 if (loop->simdlen) | |
2392 { | |
2393 bool new_simdlen_p = known_eq (new_vf, loop->simdlen); | |
2394 bool old_simdlen_p = known_eq (old_vf, loop->simdlen); | |
2395 if (new_simdlen_p != old_simdlen_p) | |
2396 return new_simdlen_p; | |
2397 } | |
2398 | |
2399 /* Limit the VFs to what is likely to be the maximum number of iterations, | |
2400 to handle cases in which at least one loop_vinfo is fully-masked. */ | |
2401 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop); | |
2402 if (estimated_max_niter != -1) | |
2403 { | |
2404 if (known_le (estimated_max_niter, new_vf)) | |
2405 new_vf = estimated_max_niter; | |
2406 if (known_le (estimated_max_niter, old_vf)) | |
2407 old_vf = estimated_max_niter; | |
2408 } | |
2409 | |
2410 /* Check whether the (fractional) cost per scalar iteration is lower | |
2411 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */ | |
2412 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost | |
2413 * poly_widest_int (old_vf)); | |
2414 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost | |
2415 * poly_widest_int (new_vf)); | |
2416 if (maybe_lt (rel_old, rel_new)) | |
2417 return false; | |
2418 if (known_lt (rel_new, rel_old)) | |
2419 return true; | |
2420 | |
2421 /* If there's nothing to choose between the loop bodies, see whether | |
2422 there's a difference in the prologue and epilogue costs. */ | |
2423 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost) | |
2424 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost; | |
2425 | |
2426 return false; | |
2427 } | |
2428 | |
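/* Illustration (not part of this file): the divisionless comparison used by
   vect_better_loop_vinfo_p above.  To test whether new_cost / new_vf beats
   old_cost / old_vf without fractional arithmetic, compare the cross products
   instead; the real code does the same with poly_widest_int operands, which
   also sidesteps the overflow this fixed-width sketch would risk.  */
#include <stdbool.h>
#include <stdint.h>

static bool
cheaper_per_scalar_iter (uint64_t new_cost, uint64_t new_vf,
                         uint64_t old_cost, uint64_t old_vf)
{
  /* new_cost / new_vf < old_cost / old_vf
     <=>  new_cost * old_vf < old_cost * new_vf.  */
  return new_cost * old_vf < old_cost * new_vf;
}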
2429 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return | |
2430 true if we should. */ | |
2431 | |
2432 static bool | |
2433 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo, | |
2434 loop_vec_info old_loop_vinfo) | |
2435 { | |
2436 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo)) | |
2437 return false; | |
2438 | |
2439 if (dump_enabled_p ()) | |
2440 dump_printf_loc (MSG_NOTE, vect_location, | |
2441 "***** Preferring vector mode %s to vector mode %s\n", | |
2442 GET_MODE_NAME (new_loop_vinfo->vector_mode), | |
2443 GET_MODE_NAME (old_loop_vinfo->vector_mode)); | |
2444 return true; | |
2445 } | |
2446 | |
2217 /* Function vect_analyze_loop. | 2447 /* Function vect_analyze_loop. |
2218 | 2448 |
2219 Apply a set of analyses on LOOP, and create a loop_vec_info struct | 2449 Apply a set of analyses on LOOP, and create a loop_vec_info struct |
2220 for it. The different analyses will record information in the | 2450 for it. The different analyses will record information in the |
2221 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must | 2451 loop_vec_info struct. */ |
2222 be vectorized. */ | |
2223 opt_loop_vec_info | 2452 opt_loop_vec_info |
2224 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo, | 2453 vect_analyze_loop (class loop *loop, vec_info_shared *shared) |
2225 vec_info_shared *shared) | |
2226 { | 2454 { |
2227 auto_vector_sizes vector_sizes; | 2455 auto_vector_modes vector_modes; |
2228 | 2456 |
2229 /* Autodetect first vector size we try. */ | 2457 /* Autodetect first vector size we try. */ |
2230 current_vector_size = 0; | 2458 unsigned int autovec_flags |
2231 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); | 2459 = targetm.vectorize.autovectorize_vector_modes (&vector_modes, |
2232 unsigned int next_size = 0; | 2460 loop->simdlen != 0); |
2461 unsigned int mode_i = 0; | |
2233 | 2462 |
2234 DUMP_VECT_SCOPE ("analyze_loop_nest"); | 2463 DUMP_VECT_SCOPE ("analyze_loop_nest"); |
2235 | 2464 |
2236 if (loop_outer (loop) | 2465 if (loop_outer (loop) |
2237 && loop_vec_info_for_loop (loop_outer (loop)) | 2466 && loop_vec_info_for_loop (loop_outer (loop)) |
2244 (vect_location, | 2473 (vect_location, |
2245 "not vectorized: loop nest containing two or more consecutive inner" | 2474 "not vectorized: loop nest containing two or more consecutive inner" |
2246 " loops cannot be vectorized\n"); | 2475 " loops cannot be vectorized\n"); |
2247 | 2476 |
2248 unsigned n_stmts = 0; | 2477 unsigned n_stmts = 0; |
2249 poly_uint64 autodetected_vector_size = 0; | 2478 machine_mode autodetected_vector_mode = VOIDmode; |
2479 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL); | |
2480 machine_mode next_vector_mode = VOIDmode; | |
2481 poly_uint64 lowest_th = 0; | |
2482 unsigned vectorized_loops = 0; | |
2483 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS) | |
2484 && !unlimited_cost_model (loop)); | |
2485 | |
2486 bool vect_epilogues = false; | |
2487 opt_result res = opt_result::success (); | |
2488 unsigned HOST_WIDE_INT simdlen = loop->simdlen; | |
2250 while (1) | 2489 while (1) |
2251 { | 2490 { |
2252 /* Check the CFG characteristics of the loop (nesting, entry/exit). */ | 2491 /* Check the CFG characteristics of the loop (nesting, entry/exit). */ |
2253 opt_loop_vec_info loop_vinfo | 2492 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared); |
2254 = vect_analyze_loop_form (loop, shared); | |
2255 if (!loop_vinfo) | 2493 if (!loop_vinfo) |
2256 { | 2494 { |
2257 if (dump_enabled_p ()) | 2495 if (dump_enabled_p ()) |
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | 2496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2259 "bad loop form.\n"); | 2497 "bad loop form.\n"); |
2498 gcc_checking_assert (first_loop_vinfo == NULL); | |
2260 return loop_vinfo; | 2499 return loop_vinfo; |
2261 } | 2500 } |
2501 loop_vinfo->vector_mode = next_vector_mode; | |
2262 | 2502 |
2263 bool fatal = false; | 2503 bool fatal = false; |
2264 | 2504 |
2265 if (orig_loop_vinfo) | 2505 /* When pick_lowest_cost_p is true, we should in principle iterate |
2266 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo; | 2506 over all the loop_vec_infos that LOOP_VINFO could replace and |
2267 | 2507 try to vectorize LOOP_VINFO under the same conditions. |
2268 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts); | 2508 E.g. when trying to replace an epilogue loop, we should vectorize |
2509 LOOP_VINFO as an epilogue loop with the same VF limit. When trying | |
2510 to replace the main loop, we should vectorize LOOP_VINFO as a main | |
2511 loop too. | |
2512 | |
2513 However, autovectorize_vector_modes is usually sorted as follows: | |
2514 | |
2515 - Modes that naturally produce lower VFs usually follow modes that | |
2516 naturally produce higher VFs. | |
2517 | |
2518 - When modes naturally produce the same VF, maskable modes | |
2519 usually follow unmaskable ones, so that the maskable mode | |
2520 can be used to vectorize the epilogue of the unmaskable mode. | |
2521 | |
2522 This order is preferred because it leads to the maximum | |
2523 epilogue vectorization opportunities. Targets should only use | |
2524 a different order if they want to make wide modes available while | |
2525 disparaging them relative to earlier, smaller modes. The assumption | |
2526 in that case is that the wider modes are more expensive in some | |
2527 way that isn't reflected directly in the costs. | |
2528 | |
2529 There should therefore be few interesting cases in which | |
2530 LOOP_VINFO fails when treated as an epilogue loop, succeeds when | |
2531 treated as a standalone loop, and ends up being genuinely cheaper | |
2532 than FIRST_LOOP_VINFO. */ | |
2533 if (vect_epilogues) | |
2534 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo; | |
2535 | |
2536 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts); | |
2537 if (mode_i == 0) | |
2538 autodetected_vector_mode = loop_vinfo->vector_mode; | |
2539 if (dump_enabled_p ()) | |
2540 { | |
2541 if (res) | |
2542 dump_printf_loc (MSG_NOTE, vect_location, | |
2543 "***** Analysis succeeded with vector mode %s\n", | |
2544 GET_MODE_NAME (loop_vinfo->vector_mode)); | |
2545 else | |
2546 dump_printf_loc (MSG_NOTE, vect_location, | |
2547 "***** Analysis failed with vector mode %s\n", | |
2548 GET_MODE_NAME (loop_vinfo->vector_mode)); | |
2549 } | |
2550 | |
2551 loop->aux = NULL; | |
2552 | |
2553 if (!fatal) | |
2554 while (mode_i < vector_modes.length () | |
2555 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i])) | |
2556 { | |
2557 if (dump_enabled_p ()) | |
2558 dump_printf_loc (MSG_NOTE, vect_location, | |
2559 "***** The result for vector mode %s would" | |
2560 " be the same\n", | |
2561 GET_MODE_NAME (vector_modes[mode_i])); | |
2562 mode_i += 1; | |
2563 } | |
2564 | |
2269 if (res) | 2565 if (res) |
2270 { | 2566 { |
2271 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; | 2567 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; |
2272 | 2568 vectorized_loops++; |
2273 return loop_vinfo; | 2569 |
2274 } | 2570 /* Once we hit the desired simdlen for the first time, |
2275 | 2571 discard any previous attempts. */ |
2276 delete loop_vinfo; | 2572 if (simdlen |
2277 | 2573 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen)) |
2278 if (next_size == 0) | 2574 { |
2279 autodetected_vector_size = current_vector_size; | 2575 delete first_loop_vinfo; |
2280 | 2576 first_loop_vinfo = opt_loop_vec_info::success (NULL); |
2281 if (next_size < vector_sizes.length () | 2577 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; |
2282 && known_eq (vector_sizes[next_size], autodetected_vector_size)) | 2578 simdlen = 0; |
2283 next_size += 1; | 2579 } |
2284 | 2580 else if (pick_lowest_cost_p && first_loop_vinfo) |
2285 if (fatal | 2581 { |
2286 || next_size == vector_sizes.length () | 2582 /* Keep trying to roll back vectorization attempts while the |
2287 || known_eq (current_vector_size, 0U)) | 2583 loop_vec_infos they produced were worse than this one. */ |
2288 return opt_loop_vec_info::propagate_failure (res); | 2584 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos; |
2585 while (!vinfos.is_empty () | |
2586 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ())) | |
2587 { | |
2588 gcc_assert (vect_epilogues); | |
2589 delete vinfos.pop (); | |
2590 } | |
2591 if (vinfos.is_empty () | |
2592 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo)) | |
2593 { | |
2594 delete first_loop_vinfo; | |
2595 first_loop_vinfo = opt_loop_vec_info::success (NULL); | |
2596 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; | |
2597 } | |
2598 } | |
2599 | |
2600 if (first_loop_vinfo == NULL) | |
2601 { | |
2602 first_loop_vinfo = loop_vinfo; | |
2603 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo); | |
2604 } | |
2605 else if (vect_epilogues | |
2606 /* For now only allow one epilogue loop. */ | |
2607 && first_loop_vinfo->epilogue_vinfos.is_empty ()) | |
2608 { | |
2609 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo); | |
2610 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); | |
2611 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo) | |
2612 || maybe_ne (lowest_th, 0U)); | |
2613 /* Keep track of the known smallest versioning | |
2614 threshold. */ | |
2615 if (ordered_p (lowest_th, th)) | |
2616 lowest_th = ordered_min (lowest_th, th); | |
2617 } | |
2618 else | |
2619 delete loop_vinfo; | |
2620 | |
2621 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is | |
2622 enabled, SIMDUID is not set, it is the innermost loop and we have | |
2623 either already found the loop's SIMDLEN or there was no SIMDLEN to | |
2624 begin with. | |
2625 TODO: Enable epilogue vectorization for loops with SIMDUID set. */ | |
2626 vect_epilogues = (!simdlen | |
2627 && loop->inner == NULL | |
2628 && param_vect_epilogues_nomask | |
2629 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo) | |
2630 && !loop->simduid | |
2631 /* For now only allow one epilogue loop, but allow | |
2632 pick_lowest_cost_p to replace it. */ | |
2633 && (first_loop_vinfo->epilogue_vinfos.is_empty () | |
2634 || pick_lowest_cost_p)); | |
2635 | |
2636 /* Commit to first_loop_vinfo if we have no reason to try | |
2637 alternatives. */ | |
2638 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p) | |
2639 break; | |
2640 } | |
2641 else | |
2642 { | |
2643 delete loop_vinfo; | |
2644 if (fatal) | |
2645 { | |
2646 gcc_checking_assert (first_loop_vinfo == NULL); | |
2647 break; | |
2648 } | |
2649 } | |
2650 | |
2651 if (mode_i < vector_modes.length () | |
2652 && VECTOR_MODE_P (autodetected_vector_mode) | |
2653 && (related_vector_mode (vector_modes[mode_i], | |
2654 GET_MODE_INNER (autodetected_vector_mode)) | |
2655 == autodetected_vector_mode) | |
2656 && (related_vector_mode (autodetected_vector_mode, | |
2657 GET_MODE_INNER (vector_modes[mode_i])) | |
2658 == vector_modes[mode_i])) | |
2659 { | |
2660 if (dump_enabled_p ()) | |
2661 dump_printf_loc (MSG_NOTE, vect_location, | |
2662 "***** Skipping vector mode %s, which would" | |
2663 " repeat the analysis for %s\n", | |
2664 GET_MODE_NAME (vector_modes[mode_i]), | |
2665 GET_MODE_NAME (autodetected_vector_mode)); | |
2666 mode_i += 1; | |
2667 } | |
2668 | |
2669 if (mode_i == vector_modes.length () | |
2670 || autodetected_vector_mode == VOIDmode) | |
2671 break; | |
2289 | 2672 |
2290 /* Try the next biggest vector size. */ | 2673 /* Try the next biggest vector size. */ |
2291 current_vector_size = vector_sizes[next_size++]; | 2674 next_vector_mode = vector_modes[mode_i++]; |
2292 if (dump_enabled_p ()) | 2675 if (dump_enabled_p ()) |
2293 { | 2676 dump_printf_loc (MSG_NOTE, vect_location, |
2294 dump_printf_loc (MSG_NOTE, vect_location, | 2677 "***** Re-trying analysis with vector mode %s\n", |
2295 "***** Re-trying analysis with " | 2678 GET_MODE_NAME (next_vector_mode)); |
2296 "vector size "); | 2679 } |
2297 dump_dec (MSG_NOTE, current_vector_size); | 2680 |
2298 dump_printf (MSG_NOTE, "\n"); | 2681 if (first_loop_vinfo) |
2299 } | 2682 { |
2300 } | 2683 loop->aux = (loop_vec_info) first_loop_vinfo; |
2684 if (dump_enabled_p ()) | |
2685 dump_printf_loc (MSG_NOTE, vect_location, | |
2686 "***** Choosing vector mode %s\n", | |
2687 GET_MODE_NAME (first_loop_vinfo->vector_mode)); | |
2688 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th; | |
2689 return first_loop_vinfo; | |
2690 } | |
2691 | |
2692 return opt_loop_vec_info::propagate_failure (res); | |
2301 } | 2693 } |
2302 | 2694 |
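/* Illustration (not part of this file): the kind of source loop the driver
   above iterates over.  On a target whose autovectorize_vector_modes hook
   offers, say, a 32-byte mode followed by a 16-byte mode (the concrete modes
   are an assumption here), the loop may first be analyzed and chosen with the
   wide mode and then re-analyzed with the narrow mode as a candidate epilogue,
   kept in first_loop_vinfo->epilogue_vinfos.  */
void
scale (float *x, int n)
{
  for (int i = 0; i < n; i++)
    x[i] *= 2.0f;
}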
2303 /* Return true if there is an in-order reduction function for CODE, storing | 2695 /* Return true if there is an in-order reduction function for CODE, storing |
2304 it in *REDUC_FN if so. */ | 2696 it in *REDUC_FN if so. */ |
2305 | 2697 |
2369 } | 2761 } |
2370 } | 2762 } |
2371 | 2763 |
2372 /* If there is a neutral value X such that SLP reduction NODE would not | 2764 /* If there is a neutral value X such that SLP reduction NODE would not |
2373 be affected by the introduction of additional X elements, return that X, | 2765 be affected by the introduction of additional X elements, return that X, |
2374 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN | 2766 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE |
2375 is true if the SLP statements perform a single reduction, false if each | 2767 is the vector type that would hold element X. REDUC_CHAIN is true if |
2376 statement performs an independent reduction. */ | 2768 the SLP statements perform a single reduction, false if each statement |
2769 performs an independent reduction. */ | |
2377 | 2770 |
2378 static tree | 2771 static tree |
2379 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code, | 2772 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type, |
2380 bool reduc_chain) | 2773 tree_code code, bool reduc_chain) |
2381 { | 2774 { |
2382 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); | 2775 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); |
2383 stmt_vec_info stmt_vinfo = stmts[0]; | 2776 stmt_vec_info stmt_vinfo = stmts[0]; |
2384 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); | |
2385 tree scalar_type = TREE_TYPE (vector_type); | 2777 tree scalar_type = TREE_TYPE (vector_type); |
2386 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father; | 2778 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father; |
2387 gcc_assert (loop); | 2779 gcc_assert (loop); |
2388 | 2780 |
2389 switch (code) | 2781 switch (code) |
2390 { | 2782 { |
2391 case WIDEN_SUM_EXPR: | 2783 case WIDEN_SUM_EXPR: |
2425 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg) | 2817 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg) |
2426 { | 2818 { |
2427 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt); | 2819 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt); |
2428 } | 2820 } |
2429 | 2821 |
2430 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction | |
2431 operation. Return true if the results of DEF_STMT_INFO are something | |
2432 that can be accumulated by such a reduction. */ | |
2433 | |
2434 static bool | |
2435 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info) | |
2436 { | |
2437 return (is_gimple_assign (def_stmt_info->stmt) | |
2438 || is_gimple_call (def_stmt_info->stmt) | |
2439 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def | |
2440 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI | |
2441 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def | |
2442 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt)))); | |
2443 } | |
2444 | |
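/* Illustration (not part of this file): in the loop below the value added
   into SUM is produced by an ordinary assignment (the multiplication), which
   vect_valid_reduction_input_p accepts; assignments, calls, inductions and
   internal-def PHIs outside the loop header all qualify as values a
   reduction can accumulate.  */
int
dot_product (const int *a, const int *b, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    {
      int prod = a[i] * b[i];   /* valid reduction input */
      sum += prod;              /* the reduction statement */
    }
  return sum;
}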
2445 /* Detect SLP reduction of the form: | |
2446 | |
2447 #a1 = phi <a5, a0> | |
2448 a2 = operation (a1) | |
2449 a3 = operation (a2) | |
2450 a4 = operation (a3) | |
2451 a5 = operation (a4) | |
2452 | |
2453 #a = phi <a5> | |
2454 | |
2455 PHI is the reduction phi node (#a1 = phi <a5, a0> above) | |
2456 FIRST_STMT is the first reduction stmt in the chain | |
2457 (a2 = operation (a1)). | |
2458 | |
2459 Return TRUE if a reduction chain was detected. */ | |
2460 | |
2461 static bool | |
2462 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi, | |
2463 gimple *first_stmt) | |
2464 { | |
2465 struct loop *loop = (gimple_bb (phi))->loop_father; | |
2466 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); | |
2467 enum tree_code code; | |
2468 gimple *loop_use_stmt = NULL; | |
2469 stmt_vec_info use_stmt_info, current_stmt_info = NULL; | |
2470 tree lhs; | |
2471 imm_use_iterator imm_iter; | |
2472 use_operand_p use_p; | |
2473 int nloop_uses, size = 0, n_out_of_loop_uses; | |
2474 bool found = false; | |
2475 | |
2476 if (loop != vect_loop) | |
2477 return false; | |
2478 | |
2479 lhs = PHI_RESULT (phi); | |
2480 code = gimple_assign_rhs_code (first_stmt); | |
2481 while (1) | |
2482 { | |
2483 nloop_uses = 0; | |
2484 n_out_of_loop_uses = 0; | |
2485 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) | |
2486 { | |
2487 gimple *use_stmt = USE_STMT (use_p); | |
2488 if (is_gimple_debug (use_stmt)) | |
2489 continue; | |
2490 | |
2491 /* Check if we got back to the reduction phi. */ | |
2492 if (use_stmt == phi) | |
2493 { | |
2494 loop_use_stmt = use_stmt; | |
2495 found = true; | |
2496 break; | |
2497 } | |
2498 | |
2499 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) | |
2500 { | |
2501 loop_use_stmt = use_stmt; | |
2502 nloop_uses++; | |
2503 } | |
2504 else | |
2505 n_out_of_loop_uses++; | |
2506 | |
2507 /* There can be either a single use in the loop or two uses in |
2508 phi nodes. */ | |
2509 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses)) | |
2510 return false; | |
2511 } | |
2512 | |
2513 if (found) | |
2514 break; | |
2515 | |
2516 /* We reached a statement with no loop uses. */ | |
2517 if (nloop_uses == 0) | |
2518 return false; | |
2519 | |
2520 /* This is a loop exit phi, and we haven't reached the reduction phi. */ | |
2521 if (gimple_code (loop_use_stmt) == GIMPLE_PHI) | |
2522 return false; | |
2523 | |
2524 if (!is_gimple_assign (loop_use_stmt) | |
2525 || code != gimple_assign_rhs_code (loop_use_stmt) | |
2526 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt))) | |
2527 return false; | |
2528 | |
2529 /* Insert USE_STMT into reduction chain. */ | |
2530 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt); | |
2531 if (current_stmt_info) | |
2532 { | |
2533 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info; | |
2534 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) | |
2535 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info); | |
2536 } | |
2537 else | |
2538 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info; | |
2539 | |
2540 lhs = gimple_assign_lhs (loop_use_stmt); | |
2541 current_stmt_info = use_stmt_info; | |
2542 size++; | |
2543 } | |
2544 | |
2545 if (!found || loop_use_stmt != phi || size < 2) | |
2546 return false; | |
2547 | |
2548 /* Swap the operands, if needed, to make the reduction operand the second |
2549 operand. */ | |
2550 lhs = PHI_RESULT (phi); | |
2551 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info); | |
2552 while (next_stmt_info) | |
2553 { | |
2554 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt); | |
2555 if (gimple_assign_rhs2 (next_stmt) == lhs) | |
2556 { | |
2557 tree op = gimple_assign_rhs1 (next_stmt); | |
2558 stmt_vec_info def_stmt_info = loop_info->lookup_def (op); | |
2559 | |
2560 /* Check that the other def is either defined in the loop | |
2561 ("vect_internal_def"), or it's an induction (defined by a | |
2562 loop-header phi-node). */ | |
2563 if (def_stmt_info | |
2564 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) | |
2565 && vect_valid_reduction_input_p (def_stmt_info)) | |
2566 { | |
2567 lhs = gimple_assign_lhs (next_stmt); | |
2568 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info); | |
2569 continue; | |
2570 } | |
2571 | |
2572 return false; | |
2573 } | |
2574 else | |
2575 { | |
2576 tree op = gimple_assign_rhs2 (next_stmt); | |
2577 stmt_vec_info def_stmt_info = loop_info->lookup_def (op); | |
2578 | |
2579 /* Check that the other def is either defined in the loop | |
2580 ("vect_internal_def"), or it's an induction (defined by a | |
2581 loop-header phi-node). */ | |
2582 if (def_stmt_info | |
2583 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) | |
2584 && vect_valid_reduction_input_p (def_stmt_info)) | |
2585 { | |
2586 if (dump_enabled_p ()) | |
2587 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G", | |
2588 next_stmt); | |
2589 | |
2590 swap_ssa_operands (next_stmt, | |
2591 gimple_assign_rhs1_ptr (next_stmt), | |
2592 gimple_assign_rhs2_ptr (next_stmt)); | |
2593 update_stmt (next_stmt); | |
2594 | |
2595 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt))) | |
2596 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; | |
2597 } | |
2598 else | |
2599 return false; | |
2600 } | |
2601 | |
2602 lhs = gimple_assign_lhs (next_stmt); | |
2603 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info); | |
2604 } | |
2605 | |
2606 /* Save the chain for further analysis in SLP detection. */ | |
2607 stmt_vec_info first_stmt_info | |
2608 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info); | |
2609 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info); | |
2610 REDUC_GROUP_SIZE (first_stmt_info) = size; | |
2611 | |
2612 return true; | |
2613 } | |
2614 | |
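/* Illustration (not part of this file): a source loop whose body becomes the
   reduction chain shown in the comment above (a2 = operation (a1),
   a3 = operation (a2), ...): each statement consumes the previous result and
   only the last value (a5) feeds the loop-header PHI.  N is assumed to count
   groups of four elements.  */
int
sum_groups_of_4 (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    {
      s = s + a[4 * i];      /* a2 = a1 + ...  */
      s = s + a[4 * i + 1];  /* a3 = a2 + ...  */
      s = s + a[4 * i + 2];  /* a4 = a3 + ...  */
      s = s + a[4 * i + 3];  /* a5 = a4 + ...  */
    }
  return s;
}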
2615 /* Return true if we need an in-order reduction for operation CODE | 2822 /* Return true if we need an in-order reduction for operation CODE |
2616 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer | 2823 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer |
2617 overflow must wrap. */ | 2824 overflow must wrap. */ |
2618 | 2825 |
2619 static bool | 2826 bool |
2620 needs_fold_left_reduction_p (tree type, tree_code code, | 2827 needs_fold_left_reduction_p (tree type, tree_code code) |
2621 bool need_wrapping_integral_overflow) | |
2622 { | 2828 { |
2623 /* CHECKME: check for !flag_finite_math_only too? */ | 2829 /* CHECKME: check for !flag_finite_math_only too? */ |
2624 if (SCALAR_FLOAT_TYPE_P (type)) | 2830 if (SCALAR_FLOAT_TYPE_P (type)) |
2625 switch (code) | 2831 switch (code) |
2626 { | 2832 { |
2634 | 2840 |
2635 if (INTEGRAL_TYPE_P (type)) | 2841 if (INTEGRAL_TYPE_P (type)) |
2636 { | 2842 { |
2637 if (!operation_no_trapping_overflow (type, code)) | 2843 if (!operation_no_trapping_overflow (type, code)) |
2638 return true; | 2844 return true; |
2639 if (need_wrapping_integral_overflow | |
2640 && !TYPE_OVERFLOW_WRAPS (type) | |
2641 && operation_can_overflow (code)) | |
2642 return true; | |
2643 return false; | 2845 return false; |
2644 } | 2846 } |
2645 | 2847 |
2646 if (SAT_FIXED_POINT_TYPE_P (type)) | 2848 if (SAT_FIXED_POINT_TYPE_P (type)) |
2647 return true; | 2849 return true; |
2648 | 2850 |
2649 return false; | 2851 return false; |
2650 } | 2852 } |
2651 | 2853 |
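/* Illustration (not part of this file): why strict floating-point addition
   answers true above.  FP addition is not associative, so reassociating a
   reduction can change its result; without relaxed-math flags the vectorizer
   must therefore keep the original (fold-left) evaluation order.  */
#include <stdio.h>

int
main (void)
{
  double in_order     = (1e16 + -1e16) + 1.0;   /* == 1.0 */
  double reassociated = 1e16 + (-1e16 + 1.0);   /* == 0.0 */
  printf ("%g vs %g\n", in_order, reassociated);
  return 0;
}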
2652 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and | 2854 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and |
2653 reduction operation CODE has a handled computation expression. */ | 2855 has a handled computation expression. Store the main reduction |
2654 | 2856 operation in *CODE. */ |
2655 bool | 2857 |
2858 static bool | |
2656 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, | 2859 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, |
2657 tree loop_arg, enum tree_code code) | 2860 tree loop_arg, enum tree_code *code, |
2861 vec<std::pair<ssa_op_iter, use_operand_p> > &path) | |
2658 { | 2862 { |
2659 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; | |
2660 auto_bitmap visited; | 2863 auto_bitmap visited; |
2661 tree lookfor = PHI_RESULT (phi); | 2864 tree lookfor = PHI_RESULT (phi); |
2662 ssa_op_iter curri; | 2865 ssa_op_iter curri; |
2663 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); | 2866 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); |
2664 while (USE_FROM_PTR (curr) != loop_arg) | 2867 while (USE_FROM_PTR (curr) != loop_arg) |
2722 } | 2925 } |
2723 | 2926 |
2724 /* Check whether the reduction path detected is valid. */ | 2927 /* Check whether the reduction path detected is valid. */ |
2725 bool fail = path.length () == 0; | 2928 bool fail = path.length () == 0; |
2726 bool neg = false; | 2929 bool neg = false; |
2930 int sign = -1; | |
2931 *code = ERROR_MARK; | |
2727 for (unsigned i = 1; i < path.length (); ++i) | 2932 for (unsigned i = 1; i < path.length (); ++i) |
2728 { | 2933 { |
2729 gimple *use_stmt = USE_STMT (path[i].second); | 2934 gimple *use_stmt = USE_STMT (path[i].second); |
2730 tree op = USE_FROM_PTR (path[i].second); | 2935 tree op = USE_FROM_PTR (path[i].second); |
2731 if (! has_single_use (op) | 2936 if (! is_gimple_assign (use_stmt) |
2732 || ! is_gimple_assign (use_stmt)) | 2937 /* The following makes sure we can compute the operand index |
2938 easily, and it mostly disallows chaining via COND_EXPR condition |
2939 operands. */ |
2940 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use | |
2941 && (gimple_num_ops (use_stmt) <= 2 | |
2942 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use) | |
2943 && (gimple_num_ops (use_stmt) <= 3 | |
2944 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use))) | |
2733 { | 2945 { |
2734 fail = true; | 2946 fail = true; |
2735 break; | 2947 break; |
2736 } | 2948 } |
2737 if (gimple_assign_rhs_code (use_stmt) != code) | 2949 /* Check there's only a single stmt the op is used on inside |
2738 { | 2950 of the loop. */ |
2739 if (code == PLUS_EXPR | 2951 imm_use_iterator imm_iter; |
2740 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) | 2952 gimple *op_use_stmt; |
2741 { | 2953 unsigned cnt = 0; |
2742 /* Track whether we negate the reduction value each iteration. */ | 2954 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op) |
2743 if (gimple_assign_rhs2 (use_stmt) == op) | 2955 if (!is_gimple_debug (op_use_stmt) |
2744 neg = ! neg; | 2956 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))) |
2745 } | 2957 { |
2746 else | 2958 /* We want to allow x + x but not x < 1 ? x : 2. */ |
2747 { | 2959 if (is_gimple_assign (op_use_stmt) |
2748 fail = true; | 2960 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR) |
2749 break; | 2961 { |
2750 } | 2962 use_operand_p use_p; |
2751 } | 2963 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) |
2752 } | 2964 cnt++; |
2753 return ! fail && ! neg; | 2965 } |
2966 else | |
2967 cnt++; | |
2968 } | |
2969 if (cnt != 1) | |
2970 { | |
2971 fail = true; | |
2972 break; | |
2973 } | |
2974 tree_code use_code = gimple_assign_rhs_code (use_stmt); | |
2975 if (use_code == MINUS_EXPR) | |
2976 { | |
2977 use_code = PLUS_EXPR; | |
2978 /* Track whether we negate the reduction value each iteration. */ | |
2979 if (gimple_assign_rhs2 (use_stmt) == op) | |
2980 neg = ! neg; | |
2981 } | |
2982 if (CONVERT_EXPR_CODE_P (use_code) | |
2983 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)), | |
2984 TREE_TYPE (gimple_assign_rhs1 (use_stmt)))) | |
2985 ; | |
2986 else if (*code == ERROR_MARK) | |
2987 { | |
2988 *code = use_code; | |
2989 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))); | |
2990 } | |
2991 else if (use_code != *code) | |
2992 { | |
2993 fail = true; | |
2994 break; | |
2995 } | |
2996 else if ((use_code == MIN_EXPR | |
2997 || use_code == MAX_EXPR) | |
2998 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)))) | |
2999 { | |
3000 fail = true; | |
3001 break; | |
3002 } | |
3003 } | |
3004 return ! fail && ! neg && *code != ERROR_MARK; | |
2754 } | 3005 } |
3006 | |
3007 bool | |
3008 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, | |
3009 tree loop_arg, enum tree_code code) | |
3010 { | |
3011 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; | |
3012 enum tree_code code_; | |
3013 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path) | |
3014 && code_ == code); | |
3015 } | |
3016 | |
2755 | 3017 |
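/* Illustration (not part of this file): reduction paths as seen by
   check_reduction_path.  In PATH_OK the PHI value flows through one PLUS_EXPR
   per iteration with a single in-loop use, so the path is accepted.  In
   PATH_NEGATED the accumulator is subtracted from the element, negating the
   running value every iteration; the NEG tracking above rejects such a
   path.  */
int
path_ok (const int *a, int n)
{
  int x = 0;
  for (int i = 0; i < n; i++)
    x = x + a[i];
  return x;
}

int
path_negated (const int *a, int n)
{
  int x = 0;
  for (int i = 0; i < n; i++)
    x = a[i] - x;   /* value negated each iteration */
  return x;
}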
2756 | 3018 |
2757 /* Function vect_is_simple_reduction | 3019 /* Function vect_is_simple_reduction |
2758 | 3020 |
2759 (1) Detect a cross-iteration def-use cycle that represents a simple | 3021 (1) Detect a cross-iteration def-use cycle that represents a simple |
2798 | 3060 |
2799 */ | 3061 */ |
2800 | 3062 |
2801 static stmt_vec_info | 3063 static stmt_vec_info |
2802 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, | 3064 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, |
2803 bool *double_reduc, | 3065 bool *double_reduc, bool *reduc_chain_p) |
2804 bool need_wrapping_integral_overflow, | |
2805 enum vect_reduction_type *v_reduc_type) | |
2806 { | 3066 { |
2807 gphi *phi = as_a <gphi *> (phi_info->stmt); | 3067 gphi *phi = as_a <gphi *> (phi_info->stmt); |
2808 struct loop *loop = (gimple_bb (phi))->loop_father; | |
2809 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); | |
2810 gimple *phi_use_stmt = NULL; | 3068 gimple *phi_use_stmt = NULL; |
2811 enum tree_code orig_code, code; | |
2812 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; | |
2813 tree type; | |
2814 int nloop_uses; | |
2815 tree name; | |
2816 imm_use_iterator imm_iter; | 3069 imm_use_iterator imm_iter; |
2817 use_operand_p use_p; | 3070 use_operand_p use_p; |
2818 bool phi_def; | |
2819 | 3071 |
2820 *double_reduc = false; | 3072 *double_reduc = false; |
2821 *v_reduc_type = TREE_CODE_REDUCTION; | 3073 *reduc_chain_p = false; |
3074 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION; | |
2822 | 3075 |
2823 tree phi_name = PHI_RESULT (phi); | 3076 tree phi_name = PHI_RESULT (phi); |
2824 /* ??? If there are no uses of the PHI result the inner loop reduction | 3077 /* ??? If there are no uses of the PHI result the inner loop reduction |
2825 won't be detected as possibly double-reduction by vectorizable_reduction | 3078 won't be detected as possibly double-reduction by vectorizable_reduction |
2826 because that tries to walk the PHI arg from the preheader edge which | 3079 because that tries to walk the PHI arg from the preheader edge which |
2827 can be constant. See PR60382. */ | 3080 can be constant. See PR60382. */ |
2828 if (has_zero_uses (phi_name)) | 3081 if (has_zero_uses (phi_name)) |
2829 return NULL; | 3082 return NULL; |
2830 nloop_uses = 0; | 3083 class loop *loop = (gimple_bb (phi))->loop_father; |
3084 unsigned nphi_def_loop_uses = 0; | |
2831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) | 3085 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) |
2832 { | 3086 { |
2833 gimple *use_stmt = USE_STMT (use_p); | 3087 gimple *use_stmt = USE_STMT (use_p); |
2834 if (is_gimple_debug (use_stmt)) | 3088 if (is_gimple_debug (use_stmt)) |
2835 continue; | 3089 continue; |
2841 "intermediate value used outside loop.\n"); | 3095 "intermediate value used outside loop.\n"); |
2842 | 3096 |
2843 return NULL; | 3097 return NULL; |
2844 } | 3098 } |
2845 | 3099 |
2846 nloop_uses++; | 3100 nphi_def_loop_uses++; |
2847 if (nloop_uses > 1) | |
2848 { | |
2849 if (dump_enabled_p ()) | |
2850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
2851 "reduction value used in loop.\n"); | |
2852 return NULL; | |
2853 } | |
2854 | |
2855 phi_use_stmt = use_stmt; | 3101 phi_use_stmt = use_stmt; |
2856 } | 3102 } |
2857 | 3103 |
2858 edge latch_e = loop_latch_edge (loop); | 3104 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); |
2859 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); | 3105 if (TREE_CODE (latch_def) != SSA_NAME) |
2860 if (TREE_CODE (loop_arg) != SSA_NAME) | |
2861 { | 3106 { |
2862 if (dump_enabled_p ()) | 3107 if (dump_enabled_p ()) |
2863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | 3108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2864 "reduction: not ssa_name: %T\n", loop_arg); | 3109 "reduction: not ssa_name: %T\n", latch_def); |
2865 return NULL; | 3110 return NULL; |
2866 } | 3111 } |
2867 | 3112 |
2868 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg); | 3113 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def); |
2869 if (!def_stmt_info | 3114 if (!def_stmt_info |
2870 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))) | 3115 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))) |
2871 return NULL; | 3116 return NULL; |
2872 | 3117 |
2873 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt)) | 3118 bool nested_in_vect_loop |
2874 { | 3119 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop); |
2875 name = gimple_assign_lhs (def_stmt); | 3120 unsigned nlatch_def_loop_uses = 0; |
2876 phi_def = false; | |
2877 } | |
2878 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt)) | |
2879 { | |
2880 name = PHI_RESULT (def_stmt); | |
2881 phi_def = true; | |
2882 } | |
2883 else | |
2884 { | |
2885 if (dump_enabled_p ()) | |
2886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
2887 "reduction: unhandled reduction operation: %G", | |
2888 def_stmt_info->stmt); | |
2889 return NULL; | |
2890 } | |
2891 | |
2892 nloop_uses = 0; | |
2893 auto_vec<gphi *, 3> lcphis; | 3121 auto_vec<gphi *, 3> lcphis; |
2894 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) | 3122 bool inner_loop_of_double_reduc = false; |
3123 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def) | |
2895 { | 3124 { |
2896 gimple *use_stmt = USE_STMT (use_p); | 3125 gimple *use_stmt = USE_STMT (use_p); |
2897 if (is_gimple_debug (use_stmt)) | 3126 if (is_gimple_debug (use_stmt)) |
2898 continue; | 3127 continue; |
2899 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) | 3128 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) |
2900 nloop_uses++; | 3129 nlatch_def_loop_uses++; |
2901 else | 3130 else |
2902 /* We can have more than one loop-closed PHI. */ | 3131 { |
2903 lcphis.safe_push (as_a <gphi *> (use_stmt)); | 3132 /* We can have more than one loop-closed PHI. */ |
2904 if (nloop_uses > 1) | 3133 lcphis.safe_push (as_a <gphi *> (use_stmt)); |
2905 { | 3134 if (nested_in_vect_loop |
2906 if (dump_enabled_p ()) | 3135 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt)) |
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | 3136 == vect_double_reduction_def)) |
2908 "reduction used in loop.\n"); | 3137 inner_loop_of_double_reduc = true; |
2909 return NULL; | 3138 } |
2910 } | 3139 } |
3140 | |
3141 /* If we are vectorizing an inner reduction, we execute it in | 
3142 the original order only when we are not dealing with a | 
3143 double reduction. */ | |
3144 if (nested_in_vect_loop && !inner_loop_of_double_reduc) | |
3145 { | |
3146 if (dump_enabled_p ()) | |
3147 report_vect_op (MSG_NOTE, def_stmt_info->stmt, | |
3148 "detected nested cycle: "); | |
3149 return def_stmt_info; | |
3150 } | |
3151 | |
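  /* Illustrative shape (invented, not from this file):

	 int sum = 0;
	 for (i = 0; i < n; i++)	<- outer-loop PHI: double reduction
	   for (j = 0; j < n; j++)	<- inner-loop PHI: nested cycle
	     sum += a[i][j];

     when vectorizing the outer loop, a plain inner cycle takes the early
     return just above, while an inner PHI feeding a double-reduction outer
     PHI has set inner_loop_of_double_reduc and falls through to the use
     checks below.  */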
3152 /* If this isn't a nested cycle or if the nested cycle reduction value | |
3153 is used outside of the inner loop, we cannot handle uses of the reduction | 
3154 value. */ | |
3155 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1) | |
3156 { | |
3157 if (dump_enabled_p ()) | |
3158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
3159 "reduction used in loop.\n"); | |
3160 return NULL; | |
2911 } | 3161 } |
2912 | 3162 |
2913 /* If DEF_STMT is a phi node itself, we expect it to have a single argument | 3163 /* If DEF_STMT is a phi node itself, we expect it to have a single argument |
2914 defined in the inner loop. */ | 3164 defined in the inner loop. */ |
2915 if (phi_def) | 3165 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt)) |
2916 { | 3166 { |
2917 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt); | 3167 tree op1 = PHI_ARG_DEF (def_stmt, 0); |
2918 op1 = PHI_ARG_DEF (def_stmt, 0); | |
2919 | |
2920 if (gimple_phi_num_args (def_stmt) != 1 | 3168 if (gimple_phi_num_args (def_stmt) != 1 |
2921 || TREE_CODE (op1) != SSA_NAME) | 3169 || TREE_CODE (op1) != SSA_NAME) |
2922 { | 3170 { |
2923 if (dump_enabled_p ()) | 3171 if (dump_enabled_p ()) |
2924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | 3172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2945 } | 3193 } |
2946 | 3194 |
2947 return NULL; | 3195 return NULL; |
2948 } | 3196 } |
2949 | 3197 |
2950 /* If we are vectorizing an inner reduction we are executing that | 3198 /* Look for the expression computing latch_def from the loop PHI result. */ |
2951 in the original order only in case we are not dealing with a | 3199 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; |
2952 double reduction. */ | 3200 enum tree_code code; |
2953 bool check_reduction = true; | 3201 if (check_reduction_path (vect_location, loop, phi, latch_def, &code, |
2954 if (flow_loop_nested_p (vect_loop, loop)) | 3202 path)) |
2955 { | 3203 { |
2956 gphi *lcphi; | 3204 STMT_VINFO_REDUC_CODE (phi_info) = code; |
3205 if (code == COND_EXPR && !nested_in_vect_loop) | |
3206 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION; | |
3207 | |
3208 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP | |
3209 reduction chain for which the additional restriction is that | |
3210 all operations in the chain are the same. */ | |
3211 auto_vec<stmt_vec_info, 8> reduc_chain; | |
2957 unsigned i; | 3212 unsigned i; |
2958 check_reduction = false; | 3213 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR; |
2959 FOR_EACH_VEC_ELT (lcphis, i, lcphi) | 3214 for (i = path.length () - 1; i >= 1; --i) |
2960 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi)) | 3215 { |
2961 { | 3216 gimple *stmt = USE_STMT (path[i].second); |
2962 gimple *use_stmt = USE_STMT (use_p); | 3217 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt); |
2963 if (is_gimple_debug (use_stmt)) | 3218 STMT_VINFO_REDUC_IDX (stmt_info) |
2964 continue; | 3219 = path[i].second->use - gimple_assign_rhs1_ptr (stmt); |
2965 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt))) | 3220 enum tree_code stmt_code = gimple_assign_rhs_code (stmt); |
2966 check_reduction = true; | 3221 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code) |
2967 } | 3222 && (i == 1 || i == path.length () - 1)); |
2968 } | 3223 if ((stmt_code != code && !leading_conversion) |
2969 | 3224 /* We can only handle the final value in epilogue |
2970 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt); | 3225 generation for reduction chains. */ |
2971 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); | 3226 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt)))) |
2972 code = orig_code = gimple_assign_rhs_code (def_stmt); | 3227 is_slp_reduc = false; |
2973 | 3228 /* For reduction chains we support a trailing/leading |
2974 /* We can handle "res -= x[i]", which is non-associative by | 3229 conversions. We do not store those in the actual chain. */ |
2975 simply rewriting this into "res += -x[i]". Avoid changing | 3230 if (leading_conversion) |
2976 gimple instruction for the first simple tests and only do this | 3231 continue; |
2977 if we're allowed to change code at all. */ | 3232 reduc_chain.safe_push (stmt_info); |
2978 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name) | 3233 } |
2979 code = PLUS_EXPR; | 3234 if (is_slp_reduc && reduc_chain.length () > 1) |
2980 | 3235 { |
2981 if (code == COND_EXPR) | 3236 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) |
2982 { | 3237 { |
2983 if (! nested_in_vect_loop) | 3238 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]; |
2984 *v_reduc_type = COND_REDUCTION; | 3239 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]; |
2985 | 3240 } |
2986 op3 = gimple_assign_rhs1 (def_stmt); | 3241 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]; |
2987 if (COMPARISON_CLASS_P (op3)) | 3242 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; |
2988 { | 3243 |
2989 op4 = TREE_OPERAND (op3, 1); | 3244 /* Save the chain for further analysis in SLP detection. */ |
2990 op3 = TREE_OPERAND (op3, 0); | 3245 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]); |
2991 } | 3246 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length (); |
2992 if (op3 == phi_name || op4 == phi_name) | 3247 |
2993 { | 3248 *reduc_chain_p = true; |
2994 if (dump_enabled_p ()) | 3249 if (dump_enabled_p ()) |
2995 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, | 3250 dump_printf_loc (MSG_NOTE, vect_location, |
2996 "reduction: condition depends on previous" | 3251 "reduction: detected reduction chain\n"); |
2997 " iteration: "); | 3252 } |
2998 return NULL; | 3253 else if (dump_enabled_p ()) |
2999 } | 3254 dump_printf_loc (MSG_NOTE, vect_location, |
3000 | 3255 "reduction: detected reduction\n"); |
3001 op1 = gimple_assign_rhs2 (def_stmt); | 3256 |
3002 op2 = gimple_assign_rhs3 (def_stmt); | |
3003 } | |
3004 else if (!commutative_tree_code (code) || !associative_tree_code (code)) | |
3005 { | |
3006 if (dump_enabled_p ()) | |
3007 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, | |
3008 "reduction: not commutative/associative: "); | |
3009 return NULL; | |
3010 } | |
3011 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS) | |
3012 { | |
3013 op1 = gimple_assign_rhs1 (def_stmt); | |
3014 op2 = gimple_assign_rhs2 (def_stmt); | |
3015 } | |
3016 else | |
3017 { | |
3018 if (dump_enabled_p ()) | |
3019 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, | |
3020 "reduction: not handled operation: "); | |
3021 return NULL; | |
3022 } | |
3023 | |
3024 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) | |
3025 { | |
3026 if (dump_enabled_p ()) | |
3027 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, | |
3028 "reduction: both uses not ssa_names: "); | |
3029 | |
3030 return NULL; | |
3031 } | |
3032 | |
3033 type = TREE_TYPE (gimple_assign_lhs (def_stmt)); | |
3034 if ((TREE_CODE (op1) == SSA_NAME | |
3035 && !types_compatible_p (type,TREE_TYPE (op1))) | |
3036 || (TREE_CODE (op2) == SSA_NAME | |
3037 && !types_compatible_p (type, TREE_TYPE (op2))) | |
3038 || (op3 && TREE_CODE (op3) == SSA_NAME | |
3039 && !types_compatible_p (type, TREE_TYPE (op3))) | |
3040 || (op4 && TREE_CODE (op4) == SSA_NAME | |
3041 && !types_compatible_p (type, TREE_TYPE (op4)))) | |
3042 { | |
3043 if (dump_enabled_p ()) | |
3044 { | |
3045 dump_printf_loc (MSG_NOTE, vect_location, | |
3046 "reduction: multiple types: operation type: " | |
3047 "%T, operands types: %T,%T", | |
3048 type, TREE_TYPE (op1), TREE_TYPE (op2)); | |
3049 if (op3) | |
3050 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3)); | |
3051 | |
3052 if (op4) | |
3053 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4)); | |
3054 dump_printf (MSG_NOTE, "\n"); | |
3055 } | |
3056 | |
3057 return NULL; | |
3058 } | |
3059 | |
3060 /* Check whether it's ok to change the order of the computation. | |
3061 Generally, when vectorizing a reduction we change the order of the | |
3062 computation. This may change the behavior of the program in some | |
3063 cases, so we need to check that this is ok. One exception is when | |
3064 vectorizing an outer-loop: the inner-loop is executed sequentially, | |
3065 and therefore vectorizing reductions in the inner-loop during | |
3066 outer-loop vectorization is safe. */ | |
3067 if (check_reduction | |
3068 && *v_reduc_type == TREE_CODE_REDUCTION | |
3069 && needs_fold_left_reduction_p (type, code, | |
3070 need_wrapping_integral_overflow)) | |
3071 *v_reduc_type = FOLD_LEFT_REDUCTION; | |
3072 | |
3073 /* Reduction is safe. We're dealing with one of the following: | |
3074 1) integer arithmetic and no trapv | |
3075 2) floating point arithmetic, and special flags permit this optimization | |
3076 3) nested cycle (i.e., outer loop vectorization). */ | |
3077 stmt_vec_info def1_info = loop_info->lookup_def (op1); | |
3078 stmt_vec_info def2_info = loop_info->lookup_def (op2); | |
3079 if (code != COND_EXPR && !def1_info && !def2_info) | |
3080 { | |
3081 if (dump_enabled_p ()) | |
3082 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: "); | |
3083 return NULL; | |
3084 } | |
3085 | |
3086 /* Check that one def is the reduction def, defined by PHI, | |
3087 the other def is either defined in the loop ("vect_internal_def"), | |
3088 or it's an induction (defined by a loop-header phi-node). */ | |
3089 | |
3090 if (def2_info | |
3091 && def2_info->stmt == phi | |
3092 && (code == COND_EXPR | |
3093 || !def1_info | |
3094 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt)) | |
3095 || vect_valid_reduction_input_p (def1_info))) | |
3096 { | |
3097 if (dump_enabled_p ()) | |
3098 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); | |
3099 return def_stmt_info; | 3257 return def_stmt_info; |
3100 } | 3258 } |
3101 | 3259 |
3102 if (def1_info | |
3103 && def1_info->stmt == phi | |
3104 && (code == COND_EXPR | |
3105 || !def2_info | |
3106 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt)) | |
3107 || vect_valid_reduction_input_p (def2_info))) | |
3108 { | |
3109 if (! nested_in_vect_loop && orig_code != MINUS_EXPR) | |
3110 { | |
3111 /* Check if we can swap operands (just for simplicity - so that | |
3112 the rest of the code can assume that the reduction variable | |
3113 is always the last (second) argument). */ | |
3114 if (code == COND_EXPR) | |
3115 { | |
3116 /* Swap cond_expr by inverting the condition. */ | |
3117 tree cond_expr = gimple_assign_rhs1 (def_stmt); | |
3118 enum tree_code invert_code = ERROR_MARK; | |
3119 enum tree_code cond_code = TREE_CODE (cond_expr); | |
3120 | |
3121 if (TREE_CODE_CLASS (cond_code) == tcc_comparison) | |
3122 { | |
3123 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0)); | |
3124 invert_code = invert_tree_comparison (cond_code, honor_nans); | |
3125 } | |
3126 if (invert_code != ERROR_MARK) | |
3127 { | |
3128 TREE_SET_CODE (cond_expr, invert_code); | |
3129 swap_ssa_operands (def_stmt, | |
3130 gimple_assign_rhs2_ptr (def_stmt), | |
3131 gimple_assign_rhs3_ptr (def_stmt)); | |
3132 } | |
3133 else | |
3134 { | |
3135 if (dump_enabled_p ()) | |
3136 report_vect_op (MSG_NOTE, def_stmt, | |
3137 "detected reduction: cannot swap operands " | |
3138 "for cond_expr"); | |
3139 return NULL; | |
3140 } | |
3141 } | |
3142 else | |
3143 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt), | |
3144 gimple_assign_rhs2_ptr (def_stmt)); | |
3145 | |
3146 if (dump_enabled_p ()) | |
3147 report_vect_op (MSG_NOTE, def_stmt, | |
3148 "detected reduction: need to swap operands: "); | |
3149 | |
3150 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt))) | |
3151 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; | |
3152 } | |
3153 else | |
3154 { | |
3155 if (dump_enabled_p ()) | |
3156 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); | |
3157 } | |
3158 | |
3159 return def_stmt_info; | |
3160 } | |
3161 | |
3162 /* Try to find SLP reduction chain. */ | |
3163 if (! nested_in_vect_loop | |
3164 && code != COND_EXPR | |
3165 && orig_code != MINUS_EXPR | |
3166 && vect_is_slp_reduction (loop_info, phi, def_stmt)) | |
3167 { | |
3168 if (dump_enabled_p ()) | |
3169 report_vect_op (MSG_NOTE, def_stmt, | |
3170 "reduction: detected reduction chain: "); | |
3171 | |
3172 return def_stmt_info; | |
3173 } | |
3174 | |
3175 /* Dissolve group eventually half-built by vect_is_slp_reduction. */ | |
3176 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info); | |
3177 while (first) | |
3178 { | |
3179 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); | |
3180 REDUC_GROUP_FIRST_ELEMENT (first) = NULL; | |
3181 REDUC_GROUP_NEXT_ELEMENT (first) = NULL; | |
3182 first = next; | |
3183 } | |
3184 | |
3185 /* Look for the expression computing loop_arg from loop PHI result. */ | |
3186 if (check_reduction_path (vect_location, loop, phi, loop_arg, code)) | |
3187 return def_stmt_info; | |
3188 | |
3189 if (dump_enabled_p ()) | 3260 if (dump_enabled_p ()) |
3190 { | 3261 dump_printf_loc (MSG_NOTE, vect_location, |
3191 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, | 3262 "reduction: unknown pattern\n"); |
3192 "reduction: unknown pattern: "); | |
3193 } | |
3194 | 3263 |
3195 return NULL; | 3264 return NULL; |
3196 } | |
3197 | |
3198 /* Wrapper around vect_is_simple_reduction, which will modify code | |
3199 in-place if it enables detection of more reductions. Arguments | |
3200 as there. */ | |
3201 | |
3202 stmt_vec_info | |
3203 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, | |
3204 bool *double_reduc, | |
3205 bool need_wrapping_integral_overflow) | |
3206 { | |
3207 enum vect_reduction_type v_reduc_type; | |
3208 stmt_vec_info def_info | |
3209 = vect_is_simple_reduction (loop_info, phi_info, double_reduc, | |
3210 need_wrapping_integral_overflow, | |
3211 &v_reduc_type); | |
3212 if (def_info) | |
3213 { | |
3214 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type; | |
3215 STMT_VINFO_REDUC_DEF (phi_info) = def_info; | |
3216 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type; | |
3217 STMT_VINFO_REDUC_DEF (def_info) = phi_info; | |
3218 } | |
3219 return def_info; | |
3220 } | 3265 } |
3221 | 3266 |
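For illustration (invented example, not from this file), the reduction
chain recognized above corresponds to a loop such as

    int chain (int *a, int n)
    {
      int sum = 0;
      for (int i = 0; i < n; i += 4)
        sum = sum + a[i] + a[i + 1] + a[i + 2] + a[i + 3];
      return sum;
    }

where every statement on the path is a PLUS_EXPR and each intermediate
result has a single use, so the four additions get linked through
REDUC_GROUP_FIRST_ELEMENT/REDUC_GROUP_NEXT_ELEMENT, with
STMT_VINFO_REDUC_IDX recording per statement which operand (counting
from 0) continues the reduction.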
3222 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ | 3267 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ |
3223 int | 3268 int |
3224 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, | 3269 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, |
3240 | 3285 |
3241 /* If peeled iterations are known but number of scalar loop | 3286 /* If peeled iterations are known but number of scalar loop |
3242 iterations are unknown, count a taken branch per peeled loop. */ | 3287 iterations are unknown, count a taken branch per peeled loop. */ |
3243 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, | 3288 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, |
3244 NULL, 0, vect_prologue); | 3289 NULL, 0, vect_prologue); |
3245 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, | 3290 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken, |
3246 NULL, 0, vect_epilogue); | 3291 NULL, 0, vect_epilogue); |
3247 } | 3292 } |
3248 else | 3293 else |
3249 { | 3294 { |
3250 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); | 3295 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); |
3251 peel_iters_prologue = niters < peel_iters_prologue ? | 3296 peel_iters_prologue = niters < peel_iters_prologue ? |
3309 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); | 3354 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); |
3310 | 3355 |
3311 /* Cost model disabled. */ | 3356 /* Cost model disabled. */ |
3312 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) | 3357 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) |
3313 { | 3358 { |
3314 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); | 3359 if (dump_enabled_p ()) |
3360 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); | |
3315 *ret_min_profitable_niters = 0; | 3361 *ret_min_profitable_niters = 0; |
3316 *ret_min_profitable_estimate = 0; | 3362 *ret_min_profitable_estimate = 0; |
3317 return; | 3363 return; |
3318 } | 3364 } |
3319 | 3365 |
3322 { | 3368 { |
3323 /* FIXME: Make cost depend on complexity of individual check. */ | 3369 /* FIXME: Make cost depend on complexity of individual check. */ |
3324 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length (); | 3370 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length (); |
3325 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, | 3371 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, |
3326 vect_prologue); | 3372 vect_prologue); |
3327 dump_printf (MSG_NOTE, | 3373 if (dump_enabled_p ()) |
3328 "cost model: Adding cost of checks for loop " | 3374 dump_printf (MSG_NOTE, |
3329 "versioning to treat misalignment.\n"); | 3375 "cost model: Adding cost of checks for loop " |
3376 "versioning to treat misalignment.\n"); | |
3330 } | 3377 } |
3331 | 3378 |
3332 /* Requires loop versioning with alias checks. */ | 3379 /* Requires loop versioning with alias checks. */ |
3333 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) | 3380 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) |
3334 { | 3381 { |
3351 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) | 3398 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) |
3352 nstmts += 1; | 3399 nstmts += 1; |
3353 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt, | 3400 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt, |
3354 NULL, 0, vect_prologue); | 3401 NULL, 0, vect_prologue); |
3355 } | 3402 } |
3356 dump_printf (MSG_NOTE, | 3403 if (dump_enabled_p ()) |
3357 "cost model: Adding cost of checks for loop " | 3404 dump_printf (MSG_NOTE, |
3358 "versioning aliasing.\n"); | 3405 "cost model: Adding cost of checks for loop " |
3406 "versioning aliasing.\n"); | |
3359 } | 3407 } |
3360 | 3408 |
3361 /* Requires loop versioning with niter checks. */ | 3409 /* Requires loop versioning with niter checks. */ |
3362 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo)) | 3410 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo)) |
3363 { | 3411 { |
3364 /* FIXME: Make cost depend on complexity of individual check. */ | 3412 /* FIXME: Make cost depend on complexity of individual check. */ |
3365 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0, | 3413 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0, |
3366 vect_prologue); | 3414 vect_prologue); |
3367 dump_printf (MSG_NOTE, | 3415 if (dump_enabled_p ()) |
3368 "cost model: Adding cost of checks for loop " | 3416 dump_printf (MSG_NOTE, |
3369 "versioning niters.\n"); | 3417 "cost model: Adding cost of checks for loop " |
3418 "versioning niters.\n"); | |
3370 } | 3419 } |
3371 | 3420 |
3372 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) | 3421 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) |
3373 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0, | 3422 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0, |
3374 vect_prologue); | 3423 vect_prologue); |
3408 j, si) | 3457 j, si) |
3409 (void) add_stmt_cost (target_cost_data, si->count, | 3458 (void) add_stmt_cost (target_cost_data, si->count, |
3410 si->kind, si->stmt_info, si->misalign, | 3459 si->kind, si->stmt_info, si->misalign, |
3411 vect_epilogue); | 3460 vect_epilogue); |
3412 } | 3461 } |
3462 | |
3463 /* Calculate how many masks we need to generate. */ | |
3464 unsigned int num_masks = 0; | |
3465 rgroup_masks *rgm; | |
3466 unsigned int num_vectors_m1; | |
3467 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm) | |
3468 if (rgm->mask_type) | |
3469 num_masks += num_vectors_m1 + 1; | |
3470 gcc_assert (num_masks > 0); | |
3471 | |
3472 /* In the worst case, we need to generate each mask in the prologue | |
3473 and in the loop body. One of the loop body mask instructions | |
3474 replaces the comparison in the scalar loop, and since we don't | |
3475 count the scalar comparison against the scalar body, we shouldn't | |
3476 count that vector instruction against the vector body either. | |
3477 | |
3478 Sometimes we can use unpacks instead of generating prologue | |
3479 masks and sometimes the prologue mask will fold to a constant, | |
3480 so the actual prologue cost might be smaller. However, it's | |
3481 simpler and safer to use the worst-case cost; if this ends up | |
3482 being the tie-breaker between vectorizing or not, then it's | |
3483 probably better not to vectorize. */ | |
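      /* As a worked instance (rgroup counts invented): with two mask
	 rgroups holding one and two vectors respectively, num_masks is
	 1 + 2 = 3, so the calls below cost 3 vector stmts in the
	 prologue and 3 - 1 = 2 in the body.  */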
3484 (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt, | |
3485 NULL, 0, vect_prologue); | |
3486 (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt, | |
3487 NULL, 0, vect_body); | |
3413 } | 3488 } |
3414 else if (npeel < 0) | 3489 else if (npeel < 0) |
3415 { | 3490 { |
3416 peel_iters_prologue = assumed_vf / 2; | 3491 peel_iters_prologue = assumed_vf / 2; |
3417 dump_printf (MSG_NOTE, "cost model: " | 3492 if (dump_enabled_p ()) |
3418 "prologue peel iters set to vf/2.\n"); | 3493 dump_printf (MSG_NOTE, "cost model: " |
3494 "prologue peel iters set to vf/2.\n"); | |
3419 | 3495 |
3420 /* If peeling for alignment is unknown, loop bound of main loop becomes | 3496 /* If peeling for alignment is unknown, loop bound of main loop becomes |
3421 unknown. */ | 3497 unknown. */ |
3422 peel_iters_epilogue = assumed_vf / 2; | 3498 peel_iters_epilogue = assumed_vf / 2; |
3423 dump_printf (MSG_NOTE, "cost model: " | 3499 if (dump_enabled_p ()) |
3424 "epilogue peel iters set to vf/2 because " | 3500 dump_printf (MSG_NOTE, "cost model: " |
3425 "peeling for alignment is unknown.\n"); | 3501 "epilogue peel iters set to vf/2 because " |
3502 "peeling for alignment is unknown.\n"); | |
3426 | 3503 |
3427 /* If peeled iterations are unknown, count a taken branch and a not taken | 3504 /* If peeled iterations are unknown, count a taken branch and a not taken |
3428 branch per peeled loop. Even if scalar loop iterations are known, | 3505 branch per peeled loop. Even if scalar loop iterations are known, |
3429 vector iterations are not known since peeled prologue iterations are | 3506 vector iterations are not known since peeled prologue iterations are |
3430 not known. Hence guards remain the same. */ | 3507 not known. Hence guards remain the same. */ |
3554 /* Complete the target-specific cost calculations. */ | 3631 /* Complete the target-specific cost calculations. */ |
3555 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost, | 3632 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost, |
3556 &vec_inside_cost, &vec_epilogue_cost); | 3633 &vec_inside_cost, &vec_epilogue_cost); |
3557 | 3634 |
3558 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); | 3635 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); |
3559 | 3636 |
3637 /* Stash the costs so that we can compare two loop_vec_infos. */ | |
3638 loop_vinfo->vec_inside_cost = vec_inside_cost; | |
3639 loop_vinfo->vec_outside_cost = vec_outside_cost; | |
3640 | |
3560 if (dump_enabled_p ()) | 3641 if (dump_enabled_p ()) |
3561 { | 3642 { |
3562 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); | 3643 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); |
3563 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n", | 3644 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n", |
3564 vec_inside_cost); | 3645 vec_inside_cost); |
3579 } | 3660 } |
3580 | 3661 |
3581 /* Calculate number of iterations required to make the vector version | 3662 /* Calculate number of iterations required to make the vector version |
3582 profitable, relative to the loop bodies only. The following condition | 3663 profitable, relative to the loop bodies only. The following condition |
3583 must hold true: | 3664 must hold true: |
3584 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC | 3665 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC |
3585 where | 3666 where |
3586 SIC = scalar iteration cost, VIC = vector iteration cost, | 3667 SIC = scalar iteration cost, VIC = vector iteration cost, |
3587 VOC = vector outside cost, VF = vectorization factor, | 3668 VOC = vector outside cost, VF = vectorization factor, |
3588 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations | 3669 NPEEL = prologue iterations + epilogue iterations, |
3589 SOC = scalar outside cost for run time cost model check. */ | 3670 SOC = scalar outside cost for run time cost model check. */ |
3590 | 3671 |
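  /* Worked instance with invented costs: SIC = 4, VIC = 6, VF = 4,
     VOC = 20, SOC = 0 and no peeling give saving_per_viter
     = 4 * 4 - 6 = 10; the unmasked computation below then yields
     min_profitable_iters = 20 * 4 / 10 = 8, bumped to 9 because at
     niters = 8 the two sides are merely equal
     (4 * 4 * 8 == 6 * 8 + 20 * 4 == 128).  */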
3591 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost) | 3672 int saving_per_viter = (scalar_single_iter_cost * assumed_vf |
3592 { | 3673 - vec_inside_cost); |
3593 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) | 3674 if (saving_per_viter <= 0) |
3594 * assumed_vf | |
3595 - vec_inside_cost * peel_iters_prologue | |
3596 - vec_inside_cost * peel_iters_epilogue); | |
3597 if (min_profitable_iters <= 0) | |
3598 min_profitable_iters = 0; | |
3599 else | |
3600 { | |
3601 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf) | |
3602 - vec_inside_cost); | |
3603 | |
3604 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters) | |
3605 <= (((int) vec_inside_cost * min_profitable_iters) | |
3606 + (((int) vec_outside_cost - scalar_outside_cost) | |
3607 * assumed_vf))) | |
3608 min_profitable_iters++; | |
3609 } | |
3610 } | |
3611 /* vector version will never be profitable. */ | |
3612 else | |
3613 { | 3675 { |
3614 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize) | 3676 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize) |
3615 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd, | 3677 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd, |
3616 "vectorization did not happen for a simd loop"); | 3678 "vectorization did not happen for a simd loop"); |
3617 | 3679 |
3625 *ret_min_profitable_niters = -1; | 3687 *ret_min_profitable_niters = -1; |
3626 *ret_min_profitable_estimate = -1; | 3688 *ret_min_profitable_estimate = -1; |
3627 return; | 3689 return; |
3628 } | 3690 } |
3629 | 3691 |
3630 dump_printf (MSG_NOTE, | 3692 /* ??? The "if" arm is written to handle all cases; see below for what |
3631 " Calculated minimum iters for profitability: %d\n", | 3693 we would do for !LOOP_VINFO_FULLY_MASKED_P. */ |
3632 min_profitable_iters); | 3694 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) |
3695 { | |
3696 /* Rewriting the condition above in terms of the number of | |
3697 vector iterations (vniters) rather than the number of | |
3698 scalar iterations (niters) gives: | |
3699 | |
3700 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC | |
3701 | |
3702 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC | |
3703 | |
3704 For integer N, X and Y when X > 0: | |
3705 | |
3706 N * X > Y <==> N >= (Y /[floor] X) + 1. */ | |
3707 int outside_overhead = (vec_outside_cost | |
3708 - scalar_single_iter_cost * peel_iters_prologue | |
3709 - scalar_single_iter_cost * peel_iters_epilogue | |
3710 - scalar_outside_cost); | |
3711 /* We're only interested in cases that require at least one | |
3712 vector iteration. */ | |
3713 int min_vec_niters = 1; | |
3714 if (outside_overhead > 0) | |
3715 min_vec_niters = outside_overhead / saving_per_viter + 1; | |
3716 | |
3717 if (dump_enabled_p ()) | |
3718 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n", | |
3719 min_vec_niters); | |
3720 | |
3721 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) | |
3722 { | |
3723 /* Now that we know the minimum number of vector iterations, | |
3724 find the minimum niters for which the scalar cost is larger: | |
3725 | |
3726 SIC * niters > VIC * vniters + VOC - SOC | |
3727 | |
3728 We know that the minimum niters is no more than | |
3729 vniters * VF + NPEEL, but it might be (and often is) less | |
3730 than that if a partial vector iteration is cheaper than the | |
3731 equivalent scalar code. */ | |
3732 int threshold = (vec_inside_cost * min_vec_niters | |
3733 + vec_outside_cost | |
3734 - scalar_outside_cost); | |
3735 if (threshold <= 0) | |
3736 min_profitable_iters = 1; | |
3737 else | |
3738 min_profitable_iters = threshold / scalar_single_iter_cost + 1; | |
3739 } | |
3740 else | |
3741 /* Convert the number of vector iterations into a number of | |
3742 scalar iterations. */ | |
3743 min_profitable_iters = (min_vec_niters * assumed_vf | |
3744 + peel_iters_prologue | |
3745 + peel_iters_epilogue); | |
3746 } | |
3747 else | |
3748 { | |
3749 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) | |
3750 * assumed_vf | |
3751 - vec_inside_cost * peel_iters_prologue | |
3752 - vec_inside_cost * peel_iters_epilogue); | |
3753 if (min_profitable_iters <= 0) | |
3754 min_profitable_iters = 0; | |
3755 else | |
3756 { | |
3757 min_profitable_iters /= saving_per_viter; | |
3758 | |
3759 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters) | |
3760 <= (((int) vec_inside_cost * min_profitable_iters) | |
3761 + (((int) vec_outside_cost - scalar_outside_cost) | |
3762 * assumed_vf))) | |
3763 min_profitable_iters++; | |
3764 } | |
3765 } | |
3766 | |
3767 if (dump_enabled_p ()) | |
3768 dump_printf (MSG_NOTE, | |
3769 " Calculated minimum iters for profitability: %d\n", | |
3770 min_profitable_iters); | |
3633 | 3771 |
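  /* Continuing the invented numbers above for the fully-masked case:
     outside_overhead = 20 - 0 - 0 - 0 = 20, so min_vec_niters
     = 20 / 10 + 1 = 3; threshold = 6 * 3 + 20 - 0 = 38 and
     min_profitable_iters = 38 / 4 + 1 = 10, since 4 * 9 = 36 <= 38
     while 4 * 10 = 40 > 38.  */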
3634 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) | 3772 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) |
3635 && min_profitable_iters < (assumed_vf + peel_iters_prologue)) | 3773 && min_profitable_iters < (assumed_vf + peel_iters_prologue)) |
3636 /* We want the vectorized loop to execute at least once. */ | 3774 /* We want the vectorized loop to execute at least once. */ |
3637 min_profitable_iters = assumed_vf + peel_iters_prologue; | 3775 min_profitable_iters = assumed_vf + peel_iters_prologue; |
3646 /* Calculate number of iterations required to make the vector version | 3784 /* Calculate number of iterations required to make the vector version |
3647 profitable, relative to the loop bodies only. | 3785 profitable, relative to the loop bodies only. |
3648 | 3786 |
3649 Non-vectorized variant is SIC * niters and it must win over vector | 3787 Non-vectorized variant is SIC * niters and it must win over vector |
3650 variant on the expected loop trip count. The following condition must hold true: | 3788 variant on the expected loop trip count. The following condition must hold true: |
3651 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */ | 3789 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */ |
3652 | 3790 |
3653 if (vec_outside_cost <= 0) | 3791 if (vec_outside_cost <= 0) |
3654 min_profitable_estimate = 0; | 3792 min_profitable_estimate = 0; |
3793 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) | |
3794 { | |
3795 /* This is a repeat of the code above, but with + SOC rather | |
3796 than - SOC. */ | |
3797 int outside_overhead = (vec_outside_cost | |
3798 - scalar_single_iter_cost * peel_iters_prologue | |
3799 - scalar_single_iter_cost * peel_iters_epilogue | |
3800 + scalar_outside_cost); | |
3801 int min_vec_niters = 1; | |
3802 if (outside_overhead > 0) | |
3803 min_vec_niters = outside_overhead / saving_per_viter + 1; | |
3804 | |
3805 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) | |
3806 { | |
3807 int threshold = (vec_inside_cost * min_vec_niters | |
3808 + vec_outside_cost | |
3809 + scalar_outside_cost); | |
3810 min_profitable_estimate = threshold / scalar_single_iter_cost + 1; | |
3811 } | |
3812 else | |
3813 min_profitable_estimate = (min_vec_niters * assumed_vf | |
3814 + peel_iters_prologue | |
3815 + peel_iters_epilogue); | |
3816 } | |
3655 else | 3817 else |
3656 { | 3818 { |
3657 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) | 3819 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) |
3658 * assumed_vf | 3820 * assumed_vf |
3659 - vec_inside_cost * peel_iters_prologue | 3821 - vec_inside_cost * peel_iters_prologue |
3718 generated within the strip-mine loop, the initial definition before | 3880 generated within the strip-mine loop, the initial definition before |
3719 the loop, and the epilogue code that must be generated. */ | 3881 the loop, and the epilogue code that must be generated. */ |
3720 | 3882 |
3721 static void | 3883 static void |
3722 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn, | 3884 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn, |
3885 vect_reduction_type reduction_type, | |
3723 int ncopies, stmt_vector_for_cost *cost_vec) | 3886 int ncopies, stmt_vector_for_cost *cost_vec) |
3724 { | 3887 { |
3725 int prologue_cost = 0, epilogue_cost = 0, inside_cost; | 3888 int prologue_cost = 0, epilogue_cost = 0, inside_cost; |
3726 enum tree_code code; | 3889 enum tree_code code; |
3727 optab optab; | 3890 optab optab; |
3728 tree vectype; | 3891 tree vectype; |
3729 machine_mode mode; | 3892 machine_mode mode; |
3730 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | 3893 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
3731 struct loop *loop = NULL; | 3894 class loop *loop = NULL; |
3732 | 3895 |
3733 if (loop_vinfo) | 3896 if (loop_vinfo) |
3734 loop = LOOP_VINFO_LOOP (loop_vinfo); | 3897 loop = LOOP_VINFO_LOOP (loop_vinfo); |
3735 | 3898 |
3736 /* Condition reductions generate two reductions in the loop. */ | 3899 /* Condition reductions generate two reductions in the loop. */ |
3737 vect_reduction_type reduction_type | |
3738 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); | |
3739 if (reduction_type == COND_REDUCTION) | 3900 if (reduction_type == COND_REDUCTION) |
3740 ncopies *= 2; | 3901 ncopies *= 2; |
3741 | 3902 |
3742 vectype = STMT_VINFO_VECTYPE (stmt_info); | 3903 vectype = STMT_VINFO_VECTYPE (stmt_info); |
3743 mode = TYPE_MODE (vectype); | 3904 mode = TYPE_MODE (vectype); |
3744 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); | 3905 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); |
3745 | 3906 |
3746 code = gimple_assign_rhs_code (orig_stmt_info->stmt); | 3907 code = gimple_assign_rhs_code (orig_stmt_info->stmt); |
3747 | 3908 |
3748 if (reduction_type == EXTRACT_LAST_REDUCTION | 3909 if (reduction_type == EXTRACT_LAST_REDUCTION) |
3749 || reduction_type == FOLD_LEFT_REDUCTION) | 3910 /* No extra instructions are needed in the prologue. The loop body |
3911 operations are costed in vectorizable_condition. */ | |
3912 inside_cost = 0; | |
3913 else if (reduction_type == FOLD_LEFT_REDUCTION) | |
3750 { | 3914 { |
3751 /* No extra instructions needed in the prologue. */ | 3915 /* No extra instructions needed in the prologue. */ |
3752 prologue_cost = 0; | 3916 prologue_cost = 0; |
3753 | 3917 |
3754 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST) | 3918 if (reduc_fn != IFN_LAST) |
3755 /* Count one reduction-like operation per vector. */ | 3919 /* Count one reduction-like operation per vector. */ |
3756 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar, | 3920 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar, |
3757 stmt_info, 0, vect_body); | 3921 stmt_info, 0, vect_body); |
3758 else | 3922 else |
3759 { | 3923 { |
3952 initialization vector is simpler (same element in all entries), if | 4116 initialization vector is simpler (same element in all entries), if |
3953 ADJUSTMENT_DEF is not NULL, and Option2 otherwise. | 4117 ADJUSTMENT_DEF is not NULL, and Option2 otherwise. |
3954 | 4118 |
3955 A cost model should help decide between these two schemes. */ | 4119 A cost model should help decide between these two schemes. */ |
3956 | 4120 |
3957 tree | 4121 static tree |
3958 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, | 4122 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, |
4123 enum tree_code code, tree init_val, | |
3959 tree *adjustment_def) | 4124 tree *adjustment_def) |
3960 { | 4125 { |
3961 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); | 4126 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); |
3962 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 4127 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
3963 tree scalar_type = TREE_TYPE (init_val); | 4128 tree scalar_type = TREE_TYPE (init_val); |
3964 tree vectype = get_vectype_for_scalar_type (scalar_type); | 4129 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); |
3965 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt); | |
3966 tree def_for_init; | 4130 tree def_for_init; |
3967 tree init_def; | 4131 tree init_def; |
3968 REAL_VALUE_TYPE real_init_val = dconst0; | 4132 REAL_VALUE_TYPE real_init_val = dconst0; |
3969 int int_init_val = 0; | 4133 int int_init_val = 0; |
3970 gimple_seq stmts = NULL; | 4134 gimple_seq stmts = NULL; |
3975 || SCALAR_FLOAT_TYPE_P (scalar_type)); | 4139 || SCALAR_FLOAT_TYPE_P (scalar_type)); |
3976 | 4140 |
3977 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo) | 4141 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo) |
3978 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father); | 4142 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father); |
3979 | 4143 |
3980 vect_reduction_type reduction_type | 4144 /* ADJUSTMENT_DEF is NULL when called from |
3981 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo); | 4145 vect_create_epilog_for_reduction to vectorize double reduction. */ |
4146 if (adjustment_def) | |
4147 *adjustment_def = NULL; | |
3982 | 4148 |
3983 switch (code) | 4149 switch (code) |
3984 { | 4150 { |
3985 case WIDEN_SUM_EXPR: | 4151 case WIDEN_SUM_EXPR: |
3986 case DOT_PROD_EXPR: | 4152 case DOT_PROD_EXPR: |
3990 case BIT_IOR_EXPR: | 4156 case BIT_IOR_EXPR: |
3991 case BIT_XOR_EXPR: | 4157 case BIT_XOR_EXPR: |
3992 case MULT_EXPR: | 4158 case MULT_EXPR: |
3993 case BIT_AND_EXPR: | 4159 case BIT_AND_EXPR: |
3994 { | 4160 { |
3995 /* ADJUSTMENT_DEF is NULL when called from | |
3996 vect_create_epilog_for_reduction to vectorize double reduction. */ | |
3997 if (adjustment_def) | |
3998 *adjustment_def = init_val; | |
3999 | |
4000 if (code == MULT_EXPR) | 4161 if (code == MULT_EXPR) |
4001 { | 4162 { |
4002 real_init_val = dconst1; | 4163 real_init_val = dconst1; |
4003 int_init_val = 1; | 4164 int_init_val = 1; |
4004 } | 4165 } |
4009 if (SCALAR_FLOAT_TYPE_P (scalar_type)) | 4170 if (SCALAR_FLOAT_TYPE_P (scalar_type)) |
4010 def_for_init = build_real (scalar_type, real_init_val); | 4171 def_for_init = build_real (scalar_type, real_init_val); |
4011 else | 4172 else |
4012 def_for_init = build_int_cst (scalar_type, int_init_val); | 4173 def_for_init = build_int_cst (scalar_type, int_init_val); |
4013 | 4174 |
4014 if (adjustment_def) | 4175 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0)) |
4015 /* Option1: the first element is '0' or '1' as well. */ | 4176 { |
4016 init_def = gimple_build_vector_from_val (&stmts, vectype, | 4177 /* Option1: the first element is '0' or '1' as well. */ |
4017 def_for_init); | 4178 if (!operand_equal_p (def_for_init, init_val, 0)) |
4179 *adjustment_def = init_val; | |
4180 init_def = gimple_build_vector_from_val (&stmts, vectype, | |
4181 def_for_init); | |
4182 } | |
4018 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) | 4183 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) |
4019 { | 4184 { |
4020 /* Option2 (variable length): the first element is INIT_VAL. */ | 4185 /* Option2 (variable length): the first element is INIT_VAL. */ |
4021 init_def = gimple_build_vector_from_val (&stmts, vectype, | 4186 init_def = gimple_build_vector_from_val (&stmts, vectype, |
4022 def_for_init); | 4187 def_for_init); |
4036 | 4201 |
4037 case MIN_EXPR: | 4202 case MIN_EXPR: |
4038 case MAX_EXPR: | 4203 case MAX_EXPR: |
4039 case COND_EXPR: | 4204 case COND_EXPR: |
4040 { | 4205 { |
4041 if (adjustment_def) | |
4042 { | |
4043 *adjustment_def = NULL_TREE; | |
4044 if (reduction_type != COND_REDUCTION | |
4045 && reduction_type != EXTRACT_LAST_REDUCTION) | |
4046 { | |
4047 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo); | |
4048 break; | |
4049 } | |
4050 } | |
4051 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); | 4206 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); |
4052 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val); | 4207 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val); |
4053 } | 4208 } |
4054 break; | 4209 break; |
4055 | 4210 |
4073 unsigned int number_of_vectors, | 4228 unsigned int number_of_vectors, |
4074 bool reduc_chain, tree neutral_op) | 4229 bool reduc_chain, tree neutral_op) |
4075 { | 4230 { |
4076 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); | 4231 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); |
4077 stmt_vec_info stmt_vinfo = stmts[0]; | 4232 stmt_vec_info stmt_vinfo = stmts[0]; |
4233 vec_info *vinfo = stmt_vinfo->vinfo; | |
4078 unsigned HOST_WIDE_INT nunits; | 4234 unsigned HOST_WIDE_INT nunits; |
4079 unsigned j, number_of_places_left_in_vector; | 4235 unsigned j, number_of_places_left_in_vector; |
4080 tree vector_type; | 4236 tree vector_type; |
4081 tree vop; | 4237 unsigned int group_size = stmts.length (); |
4082 int group_size = stmts.length (); | 4238 unsigned int i; |
4083 unsigned int vec_num, i; | 4239 class loop *loop; |
4084 unsigned number_of_copies = 1; | |
4085 vec<tree> voprnds; | |
4086 voprnds.create (number_of_vectors); | |
4087 struct loop *loop; | |
4088 auto_vec<tree, 16> permute_results; | |
4089 | 4240 |
4090 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); | 4241 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); |
4091 | 4242 |
4092 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def); | 4243 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def); |
4093 | 4244 |
4114 {s5, s6, s7, s8}. */ | 4265 {s5, s6, s7, s8}. */ |
4115 | 4266 |
4116 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits)) | 4267 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits)) |
4117 nunits = group_size; | 4268 nunits = group_size; |
4118 | 4269 |
4119 number_of_copies = nunits * number_of_vectors / group_size; | |
4120 | |
4121 number_of_places_left_in_vector = nunits; | 4270 number_of_places_left_in_vector = nunits; |
4122 bool constant_p = true; | 4271 bool constant_p = true; |
4123 tree_vector_builder elts (vector_type, nunits, 1); | 4272 tree_vector_builder elts (vector_type, nunits, 1); |
4124 elts.quick_grow (nunits); | 4273 elts.quick_grow (nunits); |
4125 for (j = 0; j < number_of_copies; j++) | 4274 gimple_seq ctor_seq = NULL; |
4126 { | 4275 for (j = 0; j < nunits * number_of_vectors; ++j) |
4127 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--) | 4276 { |
4128 { | 4277 tree op; |
4129 tree op; | 4278 i = j % group_size; |
4130 /* Get the def before the loop. In reduction chain we have only | 4279 stmt_vinfo = stmts[i]; |
4131 one initial value. */ | 4280 |
4132 if ((j != (number_of_copies - 1) | 4281 /* Get the def before the loop. In a reduction chain we have only | 
4133 || (reduc_chain && i != 0)) | 4282 one initial value, else as many as there are PHIs in the group. */ | 
4134 && neutral_op) | 4283 if (reduc_chain) |
4135 op = neutral_op; | 4284 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); |
4285 else if (((vec_oprnds->length () + 1) * nunits | |
4286 - number_of_places_left_in_vector >= group_size) | |
4287 && neutral_op) | |
4288 op = neutral_op; | |
4289 else | |
4290 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); | |
4291 | |
4292 /* Create 'vect_ = {op0,op1,...,opn}'. */ | |
4293 number_of_places_left_in_vector--; | |
4294 elts[nunits - number_of_places_left_in_vector - 1] = op; | |
4295 if (!CONSTANT_CLASS_P (op)) | |
4296 constant_p = false; | |
4297 | |
4298 if (number_of_places_left_in_vector == 0) | |
4299 { | |
4300 tree init; | |
4301 if (constant_p && !neutral_op | |
4302 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits) | |
4303 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits)) | |
4304 /* Build the vector directly from ELTS. */ | |
4305 init = gimple_build_vector (&ctor_seq, &elts); | |
4306 else if (neutral_op) | |
4307 { | |
4308 /* Build a vector of the neutral value and shift the | |
4309 other elements into place. */ | |
4310 init = gimple_build_vector_from_val (&ctor_seq, vector_type, | |
4311 neutral_op); | |
4312 int k = nunits; | |
4313 while (k > 0 && elts[k - 1] == neutral_op) | |
4314 k -= 1; | |
4315 while (k > 0) | |
4316 { | |
4317 k -= 1; | |
4318 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT, | |
4319 vector_type, init, elts[k]); | |
4320 } | |
4321 } | |
4136 else | 4322 else |
4137 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); | |
4138 | |
4139 /* Create 'vect_ = {op0,op1,...,opn}'. */ | |
4140 number_of_places_left_in_vector--; | |
4141 elts[number_of_places_left_in_vector] = op; | |
4142 if (!CONSTANT_CLASS_P (op)) | |
4143 constant_p = false; | |
4144 | |
4145 if (number_of_places_left_in_vector == 0) | |
4146 { | |
4147 gimple_seq ctor_seq = NULL; | |
4148 tree init; | |
4149 if (constant_p && !neutral_op | |
4150 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits) | |
4151 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits)) | |
4152 /* Build the vector directly from ELTS. */ | |
4153 init = gimple_build_vector (&ctor_seq, &elts); | |
4154 else if (neutral_op) | |
4155 { | |
4156 /* Build a vector of the neutral value and shift the | |
4157 other elements into place. */ | |
4158 init = gimple_build_vector_from_val (&ctor_seq, vector_type, | |
4159 neutral_op); | |
4160 int k = nunits; | |
4161 while (k > 0 && elts[k - 1] == neutral_op) | |
4162 k -= 1; | |
4163 while (k > 0) | |
4164 { | |
4165 k -= 1; | |
4166 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT, | |
4167 vector_type, init, elts[k]); | |
4168 } | |
4169 } | |
4170 else | |
4171 { | |
4172 /* First time round, duplicate ELTS to fill the | |
4173 required number of vectors, then cherry pick the | |
4174 appropriate result for each iteration. */ | |
4175 if (vec_oprnds->is_empty ()) | |
4176 duplicate_and_interleave (&ctor_seq, vector_type, elts, | |
4177 number_of_vectors, | |
4178 permute_results); | |
4179 init = permute_results[number_of_vectors - j - 1]; | |
4180 } | |
4181 if (ctor_seq != NULL) | |
4182 gsi_insert_seq_on_edge_immediate (pe, ctor_seq); | |
4183 voprnds.quick_push (init); | |
4184 | |
4185 number_of_places_left_in_vector = nunits; | |
4186 elts.new_vector (vector_type, nunits, 1); | |
4187 elts.quick_grow (nunits); | |
4188 constant_p = true; | |
4189 } | |
4190 } | |
4191 } | |
4192 | |
4193 /* Since the vectors are created in the reverse order, we should invert | |
4194 them. */ | |
4195 vec_num = voprnds.length (); | |
4196 for (j = vec_num; j != 0; j--) | |
4197 { | |
4198 vop = voprnds[j - 1]; | |
4199 vec_oprnds->quick_push (vop); | |
4200 } | |
4201 | |
4202 voprnds.release (); | |
4203 | |
4204 /* In case that VF is greater than the unrolling factor needed for the SLP | |
4205 group of stmts, NUMBER_OF_VECTORS to be created is greater than | |
4206 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have | |
4207 to replicate the vectors. */ | |
4208 tree neutral_vec = NULL; | |
4209 while (number_of_vectors > vec_oprnds->length ()) | |
4210 { | |
4211 if (neutral_op) | |
4212 { | |
4213 if (!neutral_vec) | |
4214 { | 4323 { |
4215 gimple_seq ctor_seq = NULL; | 4324 /* First time round, duplicate ELTS to fill the |
4216 neutral_vec = gimple_build_vector_from_val | 4325 required number of vectors. */ |
4217 (&ctor_seq, vector_type, neutral_op); | 4326 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts, |
4218 if (ctor_seq != NULL) | 4327 number_of_vectors, *vec_oprnds); |
4219 gsi_insert_seq_on_edge_immediate (pe, ctor_seq); | 4328 break; |
4220 } | 4329 } |
4221 vec_oprnds->quick_push (neutral_vec); | 4330 vec_oprnds->quick_push (init); |
4222 } | 4331 |
4223 else | 4332 number_of_places_left_in_vector = nunits; |
4224 { | 4333 elts.new_vector (vector_type, nunits, 1); |
4225 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++) | 4334 elts.quick_grow (nunits); |
4226 vec_oprnds->quick_push (vop); | 4335 constant_p = true; |
4227 } | 4336 } |
4228 } | 4337 } |
4338 if (ctor_seq != NULL) | |
4339 gsi_insert_seq_on_edge_immediate (pe, ctor_seq); | |
4229 } | 4340 } |
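A sketch of the shift-insert fallback above (lane values invented): for a
two-PHI SLP group with initial values s1 and s2, neutral value n and four
lanes, ELTS is {s1, s2, n, n}; starting from the splat {n, n, n, n}, the
loop issues CFN_VEC_SHL_INSERT twice, first with s2 and then with s1,
producing {s1, s2, n, n}. The trailing lanes keep the neutral value, so
extra lanes do not perturb the final reduction.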
4230 | 4341 |
4342 /* For a statement STMT_INFO taking part in a reduction operation, return | 
4343 the stmt_vec_info that the meta information is stored on. */ | 
4344 | |
4345 stmt_vec_info | |
4346 info_for_reduction (stmt_vec_info stmt_info) | |
4347 { | |
4348 stmt_info = vect_orig_stmt (stmt_info); | |
4349 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info)); | |
4350 if (!is_a <gphi *> (stmt_info->stmt)) | |
4351 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); | |
4352 gphi *phi = as_a <gphi *> (stmt_info->stmt); | |
4353 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) | |
4354 { | |
4355 if (gimple_phi_num_args (phi) == 1) | |
4356 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); | |
4357 } | |
4358 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) | |
4359 { | |
4360 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father); | |
4361 stmt_vec_info info | |
4362 = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe)); | |
4363 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def) | |
4364 stmt_info = info; | |
4365 } | |
4366 return stmt_info; | |
4367 } | |
4231 | 4368 |
4232 /* Function vect_create_epilog_for_reduction | 4369 /* Function vect_create_epilog_for_reduction |
4233 | 4370 |
4234 Create code at the loop-epilog to finalize the result of a reduction | 4371 Create code at the loop-epilog to finalize the result of a reduction |
4235 computation. | 4372 computation. |
4236 | 4373 |
4237 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector | |
4238 reduction statements. | |
4239 STMT_INFO is the scalar reduction stmt that is being vectorized. | 4374 STMT_INFO is the scalar reduction stmt that is being vectorized. |
4240 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the | |
4241 number of elements that we can fit in a vectype (nunits). In this case | |
4242 we have to generate more than one vector stmt - i.e - we need to "unroll" | |
4243 the vector stmt by a factor VF/nunits. For more details see documentation | |
4244 in vectorizable_operation. | |
4245 REDUC_FN is the internal function for the epilog reduction. | |
4246 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction | |
4247 computation. | |
4248 REDUC_INDEX is the index of the operand in the right hand side of the | |
4249 statement that is defined by REDUCTION_PHI. | |
4250 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. | |
4251 SLP_NODE is an SLP node containing a group of reduction statements. The | 4375 SLP_NODE is an SLP node containing a group of reduction statements. The |
4252 first one in this group is STMT_INFO. | 4376 first one in this group is STMT_INFO. |
4253 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case | 4377 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE. | 
4254 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to | 4378 REDUC_INDEX says which rhs operand of STMT_INFO is the reduction phi | 
4255 be smaller than any value of the IV in the loop, for MIN_EXPR larger than | 4379 (counting from 0). | 
4256 any value of the IV in the loop. | |
4257 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION. | |
4258 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is | |
4259 null if this is not an SLP reduction | |
4260 | 4380 |
4261 This function: | 4381 This function: |
4262 1. Creates the reduction def-use cycles: sets the arguments for | 4382 1. Completes the reduction def-use cycles. |
4263 REDUCTION_PHIS: | |
4264 The loop-entry argument is the vectorized initial-value of the reduction. | |
4265 The loop-latch argument is taken from VECT_DEFS - the vector of partial | |
4266 sums. | |
4267 2. "Reduces" each vector of partial results VECT_DEFS into a single result, | 4383 2. "Reduces" each vector of partial results VECT_DEFS into a single result, |
4268 by calling the function specified by REDUC_FN if available, or by | 4384 by calling the function specified by REDUC_FN if available, or by |
4269 other means (whole-vector shifts or a scalar loop). | 4385 other means (whole-vector shifts or a scalar loop). |
4270 The function also creates a new phi node at the loop exit to preserve | 4386 The function also creates a new phi node at the loop exit to preserve |
4271 loop-closed form, as illustrated below. | 4387 loop-closed form, as illustrated below. |
4272 | 4388 |
4273 The flow at the entry to this function: | 4389 The flow at the entry to this function: |
4274 | 4390 |
4275 loop: | 4391 loop: |
4276 vec_def = phi <null, null> # REDUCTION_PHI | 4392 vec_def = phi <vec_init, null> # REDUCTION_PHI |
4277 VECT_DEF = vector_stmt # vectorized form of STMT_INFO | 4393 VECT_DEF = vector_stmt # vectorized form of STMT_INFO |
4278 s_loop = scalar_stmt # (scalar) STMT_INFO | 4394 s_loop = scalar_stmt # (scalar) STMT_INFO |
4279 loop_exit: | 4395 loop_exit: |
4280 s_out0 = phi <s_loop> # (scalar) EXIT_PHI | 4396 s_out0 = phi <s_loop> # (scalar) EXIT_PHI |
4281 use <s_out0> | 4397 use <s_out0> |
4296 use <s_out4> | 4412 use <s_out4> |
4297 use <s_out4> | 4413 use <s_out4> |
4298 */ | 4414 */ |
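(Editorial sketch, not part of the diff: assuming a four-lane integer vector and a PLUS reduction, the flow described above amounts to the following GNU C; every name here is hypothetical.)

    typedef int v4si __attribute__ ((vector_size (16)));

    /* Sketch: reduce the vector of partial sums and apply the optional
       adjustment, mirroring v_out1 -> s_out3 -> s_out4 above.  */
    int
    reduction_epilogue_sketch (v4si vect_def, int adjustment_def)
    {
      v4si v_out1 = vect_def;                 /* loop-closed exit PHI  */
      int s_out3 = v_out1[0] + v_out1[1]
                   + v_out1[2] + v_out1[3];   /* reduce partial results  */
      return s_out3 + adjustment_def;         /* s_out4, see step 2.5  */
    }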
4299 | 4415 |
4300 static void | 4416 static void |
4301 vect_create_epilog_for_reduction (vec<tree> vect_defs, | 4417 vect_create_epilog_for_reduction (stmt_vec_info stmt_info, |
4302 stmt_vec_info stmt_info, | |
4303 gimple *reduc_def_stmt, | |
4304 int ncopies, internal_fn reduc_fn, | |
4305 vec<stmt_vec_info> reduction_phis, | |
4306 bool double_reduc, | |
4307 slp_tree slp_node, | 4418 slp_tree slp_node, |
4308 slp_instance slp_node_instance, | 4419 slp_instance slp_node_instance) |
4309 tree induc_val, enum tree_code induc_code, | |
4310 tree neutral_op) | |
4311 { | 4420 { |
4421 stmt_vec_info reduc_info = info_for_reduction (stmt_info); | |
4422 gcc_assert (reduc_info->is_reduc_info); | |
4423 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
4424 /* For double reductions we need to get at the inner loop reduction | |
4425 stmt which has the meta info attached. Our stmt_info is that of the | |
4426 loop-closed PHI of the inner loop which we remember as | |
4427 def for the reduction PHI generation. */ | |
4428 bool double_reduc = false; | |
4429 stmt_vec_info rdef_info = stmt_info; | |
4430 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) | |
4431 { | |
4432 gcc_assert (!slp_node); | |
4433 double_reduc = true; | |
4434 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def | |
4435 (stmt_info->stmt, 0)); | |
4436 stmt_info = vect_stmt_to_vectorize (stmt_info); | |
4437 } | |
4438 gphi *reduc_def_stmt | |
4439 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt); | |
4440 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); | |
4441 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); | |
4312 stmt_vec_info prev_phi_info; | 4442 stmt_vec_info prev_phi_info; |
4313 tree vectype; | 4443 tree vectype; |
4314 machine_mode mode; | 4444 machine_mode mode; |
4315 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | 4445 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; |
4316 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; | |
4317 basic_block exit_bb; | 4446 basic_block exit_bb; |
4318 tree scalar_dest; | 4447 tree scalar_dest; |
4319 tree scalar_type; | 4448 tree scalar_type; |
4320 gimple *new_phi = NULL, *phi; | 4449 gimple *new_phi = NULL, *phi; |
4321 stmt_vec_info phi_info; | 4450 stmt_vec_info phi_info; |
4322 gimple_stmt_iterator exit_gsi; | 4451 gimple_stmt_iterator exit_gsi; |
4323 tree vec_dest; | 4452 tree new_temp = NULL_TREE, new_name, new_scalar_dest; |
4324 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest; | |
4325 gimple *epilog_stmt = NULL; | 4453 gimple *epilog_stmt = NULL; |
4326 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt); | |
4327 gimple *exit_phi; | 4454 gimple *exit_phi; |
4328 tree bitsize; | 4455 tree bitsize; |
4329 tree adjustment_def = NULL; | 4456 tree def; |
4330 tree vec_initial_def = NULL; | |
4331 tree expr, def, initial_def = NULL; | |
4332 tree orig_name, scalar_result; | 4457 tree orig_name, scalar_result; |
4333 imm_use_iterator imm_iter, phi_imm_iter; | 4458 imm_use_iterator imm_iter, phi_imm_iter; |
4334 use_operand_p use_p, phi_use_p; | 4459 use_operand_p use_p, phi_use_p; |
4335 gimple *use_stmt; | 4460 gimple *use_stmt; |
4336 stmt_vec_info reduction_phi_info = NULL; | |
4337 bool nested_in_vect_loop = false; | 4461 bool nested_in_vect_loop = false; |
4338 auto_vec<gimple *> new_phis; | 4462 auto_vec<gimple *> new_phis; |
4339 auto_vec<stmt_vec_info> inner_phis; | |
4340 int j, i; | 4463 int j, i; |
4341 auto_vec<tree> scalar_results; | 4464 auto_vec<tree> scalar_results; |
4342 unsigned int group_size = 1, k, ratio; | 4465 unsigned int group_size = 1, k; |
4343 auto_vec<tree> vec_initial_defs; | |
4344 auto_vec<gimple *> phis; | 4466 auto_vec<gimple *> phis; |
4345 bool slp_reduc = false; | 4467 bool slp_reduc = false; |
4346 bool direct_slp_reduc; | 4468 bool direct_slp_reduc; |
4347 tree new_phi_result; | 4469 tree new_phi_result; |
4348 stmt_vec_info inner_phi = NULL; | |
4349 tree induction_index = NULL_TREE; | 4470 tree induction_index = NULL_TREE; |
4350 | 4471 |
4351 if (slp_node) | 4472 if (slp_node) |
4352 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); | 4473 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); |
4353 | 4474 |
4356 outer_loop = loop; | 4477 outer_loop = loop; |
4357 loop = loop->inner; | 4478 loop = loop->inner; |
4358 nested_in_vect_loop = true; | 4479 nested_in_vect_loop = true; |
4359 gcc_assert (!slp_node); | 4480 gcc_assert (!slp_node); |
4360 } | 4481 } |
4361 | 4482 gcc_assert (!nested_in_vect_loop || double_reduc); |
4362 vectype = STMT_VINFO_VECTYPE (stmt_info); | 4483 |
4484 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); | |
4363 gcc_assert (vectype); | 4485 gcc_assert (vectype); |
4364 mode = TYPE_MODE (vectype); | 4486 mode = TYPE_MODE (vectype); |
4365 | 4487 |
4366 /* 1. Create the reduction def-use cycle: | 4488 tree initial_def = NULL; |
4367 Set the arguments of REDUCTION_PHIS, i.e., transform | 4489 tree induc_val = NULL_TREE; |
4368 | 4490 tree adjustment_def = NULL; |
4369 loop: | |
4370 vec_def = phi <null, null> # REDUCTION_PHI | |
4371 VECT_DEF = vector_stmt # vectorized form of STMT | |
4372 ... | |
4373 | |
4374 into: | |
4375 | |
4376 loop: | |
4377 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI | |
4378 VECT_DEF = vector_stmt # vectorized form of STMT | |
4379 ... | |
4380 | |
4381 (in case of SLP, do it for all the phis). */ | |
4382 | |
4383 /* Get the loop-entry arguments. */ | |
4384 enum vect_def_type initial_def_dt = vect_unknown_def_type; | |
4385 if (slp_node) | 4491 if (slp_node) |
4386 { | 4492 ; |
4387 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | |
4388 vec_initial_defs.reserve (vec_num); | |
4389 get_initial_defs_for_reduction (slp_node_instance->reduc_phis, | |
4390 &vec_initial_defs, vec_num, | |
4391 REDUC_GROUP_FIRST_ELEMENT (stmt_info), | |
4392 neutral_op); | |
4393 } | |
4394 else | 4493 else |
4395 { | 4494 { |
4396 /* Get at the scalar def before the loop, that defines the initial value | 4495 /* Get at the scalar def before the loop, that defines the initial value |
4397 of the reduction variable. */ | 4496 of the reduction variable. */ |
4398 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, | 4497 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, |
4399 loop_preheader_edge (loop)); | 4498 loop_preheader_edge (loop)); |
4400 /* Optimize: if initial_def is for REDUC_MAX smaller than the base | 4499 /* Optimize: for induction condition reduction, if we can't use zero |
4401 and we can't use zero for induc_val, use initial_def. Similarly | 4500 for induc_val, use initial_def. */ |
4402 for REDUC_MIN and initial_def larger than the base. */ | 4501 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) |
4403 if (TREE_CODE (initial_def) == INTEGER_CST | 4502 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); |
4404 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) | 4503 else if (double_reduc) |
4405 == INTEGER_INDUC_COND_REDUCTION) | 4504 ; |
4406 && !integer_zerop (induc_val) | |
4407 && ((induc_code == MAX_EXPR | |
4408 && tree_int_cst_lt (initial_def, induc_val)) | |
4409 || (induc_code == MIN_EXPR | |
4410 && tree_int_cst_lt (induc_val, initial_def)))) | |
4411 induc_val = initial_def; | |
4412 | |
4413 if (double_reduc) | |
4414 /* In case of double reduction we only create a vector variable | |
4415 to be put in the reduction phi node. The actual statement | |
4416 creation is done later in this function. */ | |
4417 vec_initial_def = vect_create_destination_var (initial_def, vectype); | |
4418 else if (nested_in_vect_loop) | 4505 else if (nested_in_vect_loop) |
4419 { | 4506 ; |
4420 /* Do not use an adjustment def as that case is not supported | |
4421 correctly if ncopies is not one. */ | |
4422 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt); | |
4423 vec_initial_def = vect_get_vec_def_for_operand (initial_def, | |
4424 stmt_info); | |
4425 } | |
4426 else | 4507 else |
4427 vec_initial_def | 4508 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); |
4428 = get_initial_def_for_reduction (stmt_info, initial_def, | 4509 } |
4429 &adjustment_def); | 4510 |
4430 vec_initial_defs.create (1); | 4511 unsigned vec_num; |
4431 vec_initial_defs.quick_push (vec_initial_def); | 4512 int ncopies; |
4432 } | 4513 if (slp_node) |
4433 | 4514 { |
4434 /* Set phi nodes arguments. */ | 4515 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length (); |
4435 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info) | 4516 ncopies = 1; |
4436 { | 4517 } |
4437 tree vec_init_def = vec_initial_defs[i]; | 4518 else |
4438 tree def = vect_defs[i]; | 4519 { |
4439 for (j = 0; j < ncopies; j++) | 4520 vec_num = 1; |
4440 { | 4521 ncopies = 0; |
4441 if (j != 0) | 4522 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt)); |
4442 { | 4523 do |
4443 phi_info = STMT_VINFO_RELATED_STMT (phi_info); | 4524 { |
4444 if (nested_in_vect_loop) | 4525 ncopies++; |
4445 vec_init_def | 4526 phi_info = STMT_VINFO_RELATED_STMT (phi_info); |
4446 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def); | 4527 } |
4447 } | 4528 while (phi_info); |
4448 | |
4449 /* Set the loop-entry arg of the reduction-phi. */ | |
4450 | |
4451 gphi *phi = as_a <gphi *> (phi_info->stmt); | |
4452 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) | |
4453 == INTEGER_INDUC_COND_REDUCTION) | |
4454 { | |
4455 /* Initialise the reduction phi to zero. This prevents initial | |
4456 values of non-zero interfering with the reduction op. */ | |
4457 gcc_assert (ncopies == 1); | |
4458 gcc_assert (i == 0); | |
4459 | |
4460 tree vec_init_def_type = TREE_TYPE (vec_init_def); | |
4461 tree induc_val_vec | |
4462 = build_vector_from_val (vec_init_def_type, induc_val); | |
4463 | |
4464 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop), | |
4465 UNKNOWN_LOCATION); | |
4466 } | |
4467 else | |
4468 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop), | |
4469 UNKNOWN_LOCATION); | |
4470 | |
4471 /* Set the loop-latch arg for the reduction-phi. */ | |
4472 if (j > 0) | |
4473 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def); | |
4474 | |
4475 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION); | |
4476 | |
4477 if (dump_enabled_p ()) | |
4478 dump_printf_loc (MSG_NOTE, vect_location, | |
4479 "transform reduction: created def-use cycle: %G%G", | |
4480 phi, SSA_NAME_DEF_STMT (def)); | |
4481 } | |
4482 } | 4529 } |
4483 | 4530 |
4484 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) | 4531 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) |
4485 which is updated with the current index of the loop for every match of | 4532 which is updated with the current index of the loop for every match of |
4486 the original loop's cond_expr (VEC_STMT). This results in a vector | 4533 the original loop's cond_expr (VEC_STMT). This results in a vector |
4487 containing the last time the condition passed for that vector lane. | 4534 containing the last time the condition passed for that vector lane. |
4488 The first match will be a 1 to allow 0 to be used for non-matching | 4535 The first match will be a 1 to allow 0 to be used for non-matching |
4489 indexes. If there are no matches at all then the vector will be all | 4536 indexes. If there are no matches at all then the vector will be all |
4490 zeroes. */ | 4537 zeroes. |
4491 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) | 4538 |
4492 { | 4539 PR92772: This algorithm is broken for architectures that support |
4540 masked vectors, but do not provide fold_extract_last. */ | |
4541 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) | |
4542 { | |
4543 auto_vec<std::pair<tree, bool>, 2> ccompares; | |
4544 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info); | |
4545 cond_info = vect_stmt_to_vectorize (cond_info); | |
4546 while (cond_info != reduc_info) | |
4547 { | |
4548 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR) | |
4549 { | |
4550 gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt; | |
4551 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); | |
4552 ccompares.safe_push | |
4553 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)), | |
4554 STMT_VINFO_REDUC_IDX (cond_info) == 2)); | |
4555 } | |
4556 cond_info | |
4557 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt, | |
4558 1 + STMT_VINFO_REDUC_IDX | |
4559 (cond_info))); | |
4560 cond_info = vect_stmt_to_vectorize (cond_info); | |
4561 } | |
4562 gcc_assert (ccompares.length () != 0); | |
4563 | |
4493 tree indx_before_incr, indx_after_incr; | 4564 tree indx_before_incr, indx_after_incr; |
4494 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); | 4565 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); |
4495 | |
4496 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt; | |
4497 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); | |
4498 | |
4499 int scalar_precision | 4566 int scalar_precision |
4500 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype))); | 4567 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype))); |
4501 tree cr_index_scalar_type = make_unsigned_type (scalar_precision); | 4568 tree cr_index_scalar_type = make_unsigned_type (scalar_precision); |
4502 tree cr_index_vector_type = build_vector_type | 4569 tree cr_index_vector_type = get_related_vectype_for_scalar_type |
4503 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype)); | 4570 (TYPE_MODE (vectype), cr_index_scalar_type, |
4571 TYPE_VECTOR_SUBPARTS (vectype)); | |
4504 | 4572 |
4505 /* First we create a simple vector induction variable which starts | 4573 /* First we create a simple vector induction variable which starts |
4506 with the values {1,2,3,...} (SERIES_VECT) and increments by the | 4574 with the values {1,2,3,...} (SERIES_VECT) and increments by the |
4507 vector size (STEP). */ | 4575 vector size (STEP). */ |
4508 | 4576 |
4532 new_phi = create_phi_node (new_phi_tree, loop->header); | 4600 new_phi = create_phi_node (new_phi_tree, loop->header); |
4533 loop_vinfo->add_stmt (new_phi); | 4601 loop_vinfo->add_stmt (new_phi); |
4534 add_phi_arg (as_a <gphi *> (new_phi), vec_zero, | 4602 add_phi_arg (as_a <gphi *> (new_phi), vec_zero, |
4535 loop_preheader_edge (loop), UNKNOWN_LOCATION); | 4603 loop_preheader_edge (loop), UNKNOWN_LOCATION); |
4536 | 4604 |
4537 /* Now take the condition from the loop's original cond_expr | 4605 /* Now take the conditions from the loop's original cond_exprs |
4538 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for | 4606 and produce new cond_exprs (INDEX_COND_EXPR) which for |
4539 every match uses values from the induction variable | 4607 every match use values from the induction variable |
4540 (INDEX_BEFORE_INCR) otherwise uses values from the phi node | 4608 (INDEX_BEFORE_INCR) otherwise uses values from the phi node |
4541 (NEW_PHI_TREE). | 4609 (NEW_PHI_TREE). |
4542 Finally, we update the phi (NEW_PHI_TREE) to take the value of | 4610 Finally, we update the phi (NEW_PHI_TREE) to take the value of |
4543 the new cond_expr (INDEX_COND_EXPR). */ | 4611 the new cond_expr (INDEX_COND_EXPR). */ |
4544 | 4612 gimple_seq stmts = NULL; |
4545 /* Duplicate the condition from vec_stmt. */ | 4613 for (int i = ccompares.length () - 1; i != -1; --i) |
4546 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt)); | 4614 { |
4547 | 4615 tree ccompare = ccompares[i].first; |
4548 /* Create a conditional, where the condition is taken from vec_stmt | 4616 if (ccompares[i].second) |
4549 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and | 4617 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR, |
4550 else is the phi (NEW_PHI_TREE). */ | 4618 cr_index_vector_type, |
4551 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, | 4619 ccompare, |
4552 ccompare, indx_before_incr, | 4620 indx_before_incr, new_phi_tree); |
4553 new_phi_tree); | 4621 else |
4554 induction_index = make_ssa_name (cr_index_vector_type); | 4622 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR, |
4555 gimple *index_condition = gimple_build_assign (induction_index, | 4623 cr_index_vector_type, |
4556 index_cond_expr); | 4624 ccompare, |
4557 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT); | 4625 new_phi_tree, indx_before_incr); |
4558 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition); | 4626 } |
4627 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT); | |
4628 stmt_vec_info index_vec_info | |
4629 = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree)); | |
4559 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type; | 4630 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type; |
4560 | 4631 |
4561 /* Update the phi with the vec cond. */ | 4632 /* Update the phi with the vec cond. */ |
4633 induction_index = new_phi_tree; | |
4562 add_phi_arg (as_a <gphi *> (new_phi), induction_index, | 4634 add_phi_arg (as_a <gphi *> (new_phi), induction_index, |
4563 loop_latch_edge (loop), UNKNOWN_LOCATION); | 4635 loop_latch_edge (loop), UNKNOWN_LOCATION); |
4564 } | 4636 } |
4565 | 4637 |
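(Editorial sketch: a scalar model of the induction index built above, assuming four lanes and a greater-than-zero condition; all names are hypothetical. Each lane records the last 1-based iteration at which the condition held, zero meaning the lane never matched; the epilogue later selects the data value from the lane holding the maximum index.)

    /* Sketch: what the vector induction index computes, lane by lane.  */
    void
    cond_reduction_index_sketch (const int *a, unsigned n, unsigned idx[4])
    {
      for (int lane = 0; lane < 4; lane++)
        idx[lane] = 0;                       /* NEW_PHI_TREE starts all-zero  */
      for (unsigned i = 0; i + 4 <= n; i += 4)
        for (int lane = 0; lane < 4; lane++)
          if (a[i + lane] > 0)               /* the loop's cond_expr  */
            idx[lane] = i + lane + 1;        /* INDEX_BEFORE_INCR, 1-based  */
    }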
4566 /* 2. Create epilog code. | 4638 /* 2. Create epilog code. |
4591 | 4663 |
4592 | 4664 |
4593 /* 2.1 Create new loop-exit-phis to preserve loop-closed form: | 4665 /* 2.1 Create new loop-exit-phis to preserve loop-closed form: |
4594 v_out1 = phi <VECT_DEF> | 4666 v_out1 = phi <VECT_DEF> |
4595 Store them in NEW_PHIS. */ | 4667 Store them in NEW_PHIS. */ |
4596 | 4668 if (double_reduc) |
4669 loop = outer_loop; | |
4597 exit_bb = single_exit (loop)->dest; | 4670 exit_bb = single_exit (loop)->dest; |
4598 prev_phi_info = NULL; | 4671 prev_phi_info = NULL; |
4599 new_phis.create (vect_defs.length ()); | 4672 new_phis.create (slp_node ? vec_num : ncopies); |
4600 FOR_EACH_VEC_ELT (vect_defs, i, def) | 4673 for (unsigned i = 0; i < vec_num; i++) |
4601 { | 4674 { |
4675 if (slp_node) | |
4676 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt); | |
4677 else | |
4678 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt); | |
4602 for (j = 0; j < ncopies; j++) | 4679 for (j = 0; j < ncopies; j++) |
4603 { | 4680 { |
4604 tree new_def = copy_ssa_name (def); | 4681 tree new_def = copy_ssa_name (def); |
4605 phi = create_phi_node (new_def, exit_bb); | 4682 phi = create_phi_node (new_def, exit_bb); |
4606 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi); | 4683 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi); |
4615 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); | 4692 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); |
4616 prev_phi_info = phi_info; | 4693 prev_phi_info = phi_info; |
4617 } | 4694 } |
4618 } | 4695 } |
4619 | 4696 |
4620 /* The epilogue is created for the outer-loop, i.e., for the loop being | |
4621 vectorized. Create exit phis for the outer loop. */ | |
4622 if (double_reduc) | |
4623 { | |
4624 loop = outer_loop; | |
4625 exit_bb = single_exit (loop)->dest; | |
4626 inner_phis.create (vect_defs.length ()); | |
4627 FOR_EACH_VEC_ELT (new_phis, i, phi) | |
4628 { | |
4629 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi); | |
4630 tree new_result = copy_ssa_name (PHI_RESULT (phi)); | |
4631 gphi *outer_phi = create_phi_node (new_result, exit_bb); | |
4632 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, | |
4633 PHI_RESULT (phi)); | |
4634 prev_phi_info = loop_vinfo->add_stmt (outer_phi); | |
4635 inner_phis.quick_push (phi_info); | |
4636 new_phis[i] = outer_phi; | |
4637 while (STMT_VINFO_RELATED_STMT (phi_info)) | |
4638 { | |
4639 phi_info = STMT_VINFO_RELATED_STMT (phi_info); | |
4640 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt)); | |
4641 outer_phi = create_phi_node (new_result, exit_bb); | |
4642 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, | |
4643 PHI_RESULT (phi_info->stmt)); | |
4644 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi); | |
4645 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info; | |
4646 prev_phi_info = outer_phi_info; | |
4647 } | |
4648 } | |
4649 } | |
4650 | |
4651 exit_gsi = gsi_after_labels (exit_bb); | 4697 exit_gsi = gsi_after_labels (exit_bb); |
4652 | 4698 |
4653 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 | 4699 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 |
4654 (i.e. when reduc_fn is not available) and in the final adjustment | 4700 (i.e. when reduc_fn is not available) and in the final adjustment |
4655 code (if needed). Also get the original scalar reduction variable as | 4701 code (if needed). Also get the original scalar reduction variable as |
4664 { | 4710 { |
4665 /* Reduction pattern */ | 4711 /* Reduction pattern */ |
4666 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); | 4712 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); |
4667 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); | 4713 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); |
4668 } | 4714 } |
4669 | |
4670 code = gimple_assign_rhs_code (orig_stmt_info->stmt); | |
4671 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, | |
4672 partial results are added and not subtracted. */ | |
4673 if (code == MINUS_EXPR) | |
4674 code = PLUS_EXPR; | |
4675 | 4715 |
4676 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt); | 4716 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt); |
4677 scalar_type = TREE_TYPE (scalar_dest); | 4717 scalar_type = TREE_TYPE (scalar_dest); |
4678 scalar_results.create (group_size); | 4718 scalar_results.create (group_size); |
4679 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); | 4719 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); |
4680 bitsize = TYPE_SIZE (scalar_type); | 4720 bitsize = TYPE_SIZE (scalar_type); |
4681 | 4721 |
4682 /* In case this is a reduction in an inner-loop while vectorizing an outer | |
4683 loop - we don't need to extract a single scalar result at the end of the | |
4684 inner-loop (unless it is double reduction, i.e., the use of reduction is | |
4685 outside the outer-loop). The final vector of partial results will be used | |
4686 in the vectorized outer-loop, or reduced to a scalar result at the end of | |
4687 the outer-loop. */ | |
4688 if (nested_in_vect_loop && !double_reduc) | |
4689 goto vect_finalize_reduction; | |
4690 | |
4691 /* SLP reduction without reduction chain, e.g., | 4722 /* SLP reduction without reduction chain, e.g., |
4692 # a1 = phi <a2, a0> | 4723 # a1 = phi <a2, a0> |
4693 # b1 = phi <b2, b0> | 4724 # b1 = phi <b2, b0> |
4694 a2 = operation (a1) | 4725 a2 = operation (a1) |
4695 b2 = operation (b1) */ | 4726 b2 = operation (b1) */ |
4708 | 4739 |
4709 we may end up with more than one vector result. Here we reduce them to | 4740 we may end up with more than one vector result. Here we reduce them to |
4710 one vector. */ | 4741 one vector. */ |
4711 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc) | 4742 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc) |
4712 { | 4743 { |
4744 gimple_seq stmts = NULL; | |
4713 tree first_vect = PHI_RESULT (new_phis[0]); | 4745 tree first_vect = PHI_RESULT (new_phis[0]); |
4714 gassign *new_vec_stmt = NULL; | 4746 first_vect = gimple_convert (&stmts, vectype, first_vect); |
4715 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
4716 for (k = 1; k < new_phis.length (); k++) | 4747 for (k = 1; k < new_phis.length (); k++) |
4717 { | 4748 { |
4718 gimple *next_phi = new_phis[k]; | 4749 gimple *next_phi = new_phis[k]; |
4719 tree second_vect = PHI_RESULT (next_phi); | 4750 tree second_vect = PHI_RESULT (next_phi); |
4720 tree tem = make_ssa_name (vec_dest, new_vec_stmt); | 4751 second_vect = gimple_convert (&stmts, vectype, second_vect); |
4721 new_vec_stmt = gimple_build_assign (tem, code, | 4752 first_vect = gimple_build (&stmts, code, vectype, |
4722 first_vect, second_vect); | 4753 first_vect, second_vect); |
4723 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); | |
4724 first_vect = tem; | |
4725 } | 4754 } |
4755 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); | |
4726 | 4756 |
4727 new_phi_result = first_vect; | 4757 new_phi_result = first_vect; |
4728 if (new_vec_stmt) | 4758 new_phis.truncate (0); |
4729 { | 4759 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); |
4730 new_phis.truncate (0); | |
4731 new_phis.safe_push (new_vec_stmt); | |
4732 } | |
4733 } | 4760 } |
4734 /* Likewise if we couldn't use a single def-use cycle. */ | 4761 /* Likewise if we couldn't use a single def-use cycle. */ |
4735 else if (ncopies > 1) | 4762 else if (ncopies > 1) |
4736 { | 4763 { |
4737 gcc_assert (new_phis.length () == 1); | 4764 gcc_assert (new_phis.length () == 1); |
4765 gimple_seq stmts = NULL; | |
4738 tree first_vect = PHI_RESULT (new_phis[0]); | 4766 tree first_vect = PHI_RESULT (new_phis[0]); |
4739 gassign *new_vec_stmt = NULL; | 4767 first_vect = gimple_convert (&stmts, vectype, first_vect); |
4740 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
4741 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]); | 4768 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]); |
4742 for (int k = 1; k < ncopies; ++k) | 4769 for (int k = 1; k < ncopies; ++k) |
4743 { | 4770 { |
4744 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info); | 4771 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info); |
4745 tree second_vect = PHI_RESULT (next_phi_info->stmt); | 4772 tree second_vect = PHI_RESULT (next_phi_info->stmt); |
4746 tree tem = make_ssa_name (vec_dest, new_vec_stmt); | 4773 second_vect = gimple_convert (&stmts, vectype, second_vect); |
4747 new_vec_stmt = gimple_build_assign (tem, code, | 4774 first_vect = gimple_build (&stmts, code, vectype, |
4748 first_vect, second_vect); | 4775 first_vect, second_vect); |
4749 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); | 4776 } |
4750 first_vect = tem; | 4777 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
4751 } | |
4752 new_phi_result = first_vect; | 4778 new_phi_result = first_vect; |
4753 new_phis.truncate (0); | 4779 new_phis.truncate (0); |
4754 new_phis.safe_push (new_vec_stmt); | 4780 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); |
4755 } | 4781 } |
4756 else | 4782 else |
4757 new_phi_result = PHI_RESULT (new_phis[0]); | 4783 new_phi_result = PHI_RESULT (new_phis[0]); |
4758 | 4784 |
4759 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION | 4785 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION |
4760 && reduc_fn != IFN_LAST) | 4786 && reduc_fn != IFN_LAST) |
4761 { | 4787 { |
4762 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing | 4788 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing |
4763 various data values where the condition matched and another vector | 4789 various data values where the condition matched and another vector |
4764 (INDUCTION_INDEX) containing all the indexes of those matches. We | 4790 (INDUCTION_INDEX) containing all the indexes of those matches. We |
4769 | 4795 |
4770 /* Get various versions of the type of the vector of indexes. */ | 4796 /* Get various versions of the type of the vector of indexes. */ |
4771 tree index_vec_type = TREE_TYPE (induction_index); | 4797 tree index_vec_type = TREE_TYPE (induction_index); |
4772 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); | 4798 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); |
4773 tree index_scalar_type = TREE_TYPE (index_vec_type); | 4799 tree index_scalar_type = TREE_TYPE (index_vec_type); |
4774 tree index_vec_cmp_type = build_same_sized_truth_vector_type | 4800 tree index_vec_cmp_type = truth_type_for (index_vec_type); |
4775 (index_vec_type); | |
4776 | 4801 |
4777 /* Get an unsigned integer version of the type of the data vector. */ | 4802 /* Get an unsigned integer version of the type of the data vector. */ |
4778 int scalar_precision | 4803 int scalar_precision |
4779 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); | 4804 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); |
4780 tree scalar_type_unsigned = make_unsigned_type (scalar_precision); | 4805 tree scalar_type_unsigned = make_unsigned_type (scalar_precision); |
4786 can create using a MAX reduction and then expanding. | 4811 can create using a MAX reduction and then expanding. |
4787 In the case where the loop never made any matches, the max index will | 4812 In the case where the loop never made any matches, the max index will |
4788 be zero. */ | 4813 be zero. */ |
4789 | 4814 |
4790 /* Vector of {0, 0, 0,...}. */ | 4815 /* Vector of {0, 0, 0,...}. */ |
4791 tree zero_vec = make_ssa_name (vectype); | 4816 tree zero_vec = build_zero_cst (vectype); |
4792 tree zero_vec_rhs = build_zero_cst (vectype); | 4817 |
4793 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs); | 4818 gimple_seq stmts = NULL; |
4794 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT); | 4819 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); |
4820 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); | |
4795 | 4821 |
4796 /* Find maximum value from the vector of found indexes. */ | 4822 /* Find maximum value from the vector of found indexes. */ |
4797 tree max_index = make_ssa_name (index_scalar_type); | 4823 tree max_index = make_ssa_name (index_scalar_type); |
4798 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, | 4824 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, |
4799 1, induction_index); | 4825 1, induction_index); |
4857 gimple_call_set_lhs (data_reduc_stmt, data_reduc); | 4883 gimple_call_set_lhs (data_reduc_stmt, data_reduc); |
4858 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT); | 4884 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT); |
4859 | 4885 |
4860 /* Convert the reduced value back to the result type and set as the | 4886 /* Convert the reduced value back to the result type and set as the |
4861 result. */ | 4887 result. */ |
4862 gimple_seq stmts = NULL; | 4888 stmts = NULL; |
4863 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, | 4889 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, |
4864 data_reduc); | 4890 data_reduc); |
4865 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); | 4891 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
4866 scalar_results.safe_push (new_temp); | 4892 scalar_results.safe_push (new_temp); |
4867 } | 4893 } |
4868 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION | 4894 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION |
4869 && reduc_fn == IFN_LAST) | 4895 && reduc_fn == IFN_LAST) |
4870 { | 4896 { |
4871 /* Condition reduction without supported IFN_REDUC_MAX. Generate | 4897 /* Condition reduction without supported IFN_REDUC_MAX. Generate |
4872 idx = 0; | 4898 idx = 0; |
4873 idx_val = induction_index[0]; | 4899 idx_val = induction_index[0]; |
4906 bitsize_int (off))); | 4932 bitsize_int (off))); |
4907 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | 4933 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
4908 if (off != 0) | 4934 if (off != 0) |
4909 { | 4935 { |
4910 tree new_idx_val = idx_val; | 4936 tree new_idx_val = idx_val; |
4911 tree new_val = val; | |
4912 if (off != v_size - el_size) | 4937 if (off != v_size - el_size) |
4913 { | 4938 { |
4914 new_idx_val = make_ssa_name (idx_eltype); | 4939 new_idx_val = make_ssa_name (idx_eltype); |
4915 epilog_stmt = gimple_build_assign (new_idx_val, | 4940 epilog_stmt = gimple_build_assign (new_idx_val, |
4916 MAX_EXPR, idx_val, | 4941 MAX_EXPR, idx_val, |
4917 old_idx_val); | 4942 old_idx_val); |
4918 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | 4943 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
4919 } | 4944 } |
4920 new_val = make_ssa_name (data_eltype); | 4945 tree new_val = make_ssa_name (data_eltype); |
4921 epilog_stmt = gimple_build_assign (new_val, | 4946 epilog_stmt = gimple_build_assign (new_val, |
4922 COND_EXPR, | 4947 COND_EXPR, |
4923 build2 (GT_EXPR, | 4948 build2 (GT_EXPR, |
4924 boolean_type_node, | 4949 boolean_type_node, |
4925 idx_val, | 4950 idx_val, |
4951 | 4976 |
4952 if (dump_enabled_p ()) | 4977 if (dump_enabled_p ()) |
4953 dump_printf_loc (MSG_NOTE, vect_location, | 4978 dump_printf_loc (MSG_NOTE, vect_location, |
4954 "Reduce using direct vector reduction.\n"); | 4979 "Reduce using direct vector reduction.\n"); |
4955 | 4980 |
4981 gimple_seq stmts = NULL; | |
4982 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); | |
4956 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); | 4983 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); |
4957 if (!useless_type_conversion_p (scalar_type, vec_elem_type)) | 4984 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn), |
4958 { | 4985 vec_elem_type, new_phi_result); |
4959 tree tmp_dest | 4986 new_temp = gimple_convert (&stmts, scalar_type, new_temp); |
4960 = vect_create_destination_var (scalar_dest, vec_elem_type); | 4987 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
4961 epilog_stmt = gimple_build_call_internal (reduc_fn, 1, | 4988 |
4962 new_phi_result); | 4989 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) |
4963 gimple_set_lhs (epilog_stmt, tmp_dest); | 4990 && induc_val) |
4964 new_temp = make_ssa_name (tmp_dest, epilog_stmt); | |
4965 gimple_set_lhs (epilog_stmt, new_temp); | |
4966 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
4967 | |
4968 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR, | |
4969 new_temp); | |
4970 } | |
4971 else | |
4972 { | |
4973 epilog_stmt = gimple_build_call_internal (reduc_fn, 1, | |
4974 new_phi_result); | |
4975 gimple_set_lhs (epilog_stmt, new_scalar_dest); | |
4976 } | |
4977 | |
4978 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); | |
4979 gimple_set_lhs (epilog_stmt, new_temp); | |
4980 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
4981 | |
4982 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) | |
4983 == INTEGER_INDUC_COND_REDUCTION) | |
4984 && !operand_equal_p (initial_def, induc_val, 0)) | |
4985 { | 4991 { |
4986 /* Earlier we set the initial value to be a vector of induc_val | 4992 /* Earlier we set the initial value to be a vector of induc_val |
4987 values. Check the result and if it is induc_val then replace | 4993 values. Check the result and if it is induc_val then replace |
4988 with the original initial value, unless induc_val is | 4994 with the original initial value, unless induc_val is |
4989 the same as initial_def already. */ | 4995 the same as initial_def already. */ |
5017 /* Build a vector {0, 1, 2, ...}, with the same number of elements | 5023 /* Build a vector {0, 1, 2, ...}, with the same number of elements |
5018 and the same element size as VECTYPE. */ | 5024 and the same element size as VECTYPE. */ |
5019 tree index = build_index_vector (vectype, 0, 1); | 5025 tree index = build_index_vector (vectype, 0, 1); |
5020 tree index_type = TREE_TYPE (index); | 5026 tree index_type = TREE_TYPE (index); |
5021 tree index_elt_type = TREE_TYPE (index_type); | 5027 tree index_elt_type = TREE_TYPE (index_type); |
5022 tree mask_type = build_same_sized_truth_vector_type (index_type); | 5028 tree mask_type = truth_type_for (index_type); |
5023 | 5029 |
5024 /* Create a vector that, for each element, identifies which of | 5030 /* Create a vector that, for each element, identifies which of |
5025 the REDUC_GROUP_SIZE results should use it. */ | 5031 the REDUC_GROUP_SIZE results should use it. */ |
5026 tree index_mask = build_int_cst (index_elt_type, group_size - 1); | 5032 tree index_mask = build_int_cst (index_elt_type, group_size - 1); |
5027 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index, | 5033 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index, |
5029 | 5035 |
5030 /* Get a neutral vector value. This is simply a splat of the neutral | 5036 /* Get a neutral vector value. This is simply a splat of the neutral |
5031 scalar value if we have one, otherwise the initial scalar value | 5037 scalar value if we have one, otherwise the initial scalar value |
5032 is itself a neutral value. */ | 5038 is itself a neutral value. */ |
5033 tree vector_identity = NULL_TREE; | 5039 tree vector_identity = NULL_TREE; |
5040 tree neutral_op = NULL_TREE; | |
5041 if (slp_node) | |
5042 { | |
5043 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info); | |
5044 neutral_op | |
5045 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, | |
5046 vectype, code, first != NULL); | |
5047 } | |
5034 if (neutral_op) | 5048 if (neutral_op) |
5035 vector_identity = gimple_build_vector_from_val (&seq, vectype, | 5049 vector_identity = gimple_build_vector_from_val (&seq, vectype, |
5036 neutral_op); | 5050 neutral_op); |
5037 for (unsigned int i = 0; i < group_size; ++i) | 5051 for (unsigned int i = 0; i < group_size; ++i) |
5038 { | 5052 { |
5042 if (!neutral_op) | 5056 if (!neutral_op) |
5043 { | 5057 { |
5044 tree scalar_value | 5058 tree scalar_value |
5045 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt, | 5059 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt, |
5046 loop_preheader_edge (loop)); | 5060 loop_preheader_edge (loop)); |
5061 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype), | |
5062 scalar_value); | |
5047 vector_identity = gimple_build_vector_from_val (&seq, vectype, | 5063 vector_identity = gimple_build_vector_from_val (&seq, vectype, |
5048 scalar_value); | 5064 scalar_value); |
5049 } | 5065 } |
5050 | 5066 |
5051 /* Calculate the equivalent of: | 5067 /* Calculate the equivalent of: |
5078 else | 5094 else |
5079 { | 5095 { |
5080 bool reduce_with_shift; | 5096 bool reduce_with_shift; |
5081 tree vec_temp; | 5097 tree vec_temp; |
5082 | 5098 |
5083 /* COND reductions all do the final reduction with MAX_EXPR | 5099 gcc_assert (slp_reduc || new_phis.length () == 1); |
5084 or MIN_EXPR. */ | |
5085 if (code == COND_EXPR) | |
5086 { | |
5087 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) | |
5088 == INTEGER_INDUC_COND_REDUCTION) | |
5089 code = induc_code; | |
5090 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) | |
5091 == CONST_COND_REDUCTION) | |
5092 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); | |
5093 else | |
5094 code = MAX_EXPR; | |
5095 } | |
5096 | 5100 |
5097 /* See if the target wants to do the final (shift) reduction | 5101 /* See if the target wants to do the final (shift) reduction |
5098 in a vector mode of smaller size and first reduce upper/lower | 5102 in a vector mode of smaller size and first reduce upper/lower |
5099 halves against each other. */ | 5103 halves against each other. */ |
5100 enum machine_mode mode1 = mode; | 5104 enum machine_mode mode1 = mode; |
5101 tree vectype1 = vectype; | 5105 tree stype = TREE_TYPE (vectype); |
5102 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype)); | 5106 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); |
5103 unsigned sz1 = sz; | 5107 unsigned nunits1 = nunits; |
5108 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode | |
5109 && new_phis.length () == 1) | |
5110 { | |
5111 nunits1 = GET_MODE_NUNITS (mode1).to_constant (); | |
5112 /* For SLP reductions we have to make sure lanes match up, but | |
5113 since we're doing individual element final reduction, reducing | |
5114 vector width here is even more important. | |
5115 ??? We can also separate lanes with permutes, for the common | |
5116 case of power-of-two group-size, odd/even extracts would work. */ | |
5117 if (slp_reduc && nunits != nunits1) | |
5118 { | |
5119 nunits1 = least_common_multiple (nunits1, group_size); | |
5120 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits); | |
5121 } | |
5122 } | |
5104 if (!slp_reduc | 5123 if (!slp_reduc |
5105 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) | 5124 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) |
5106 sz1 = GET_MODE_SIZE (mode1).to_constant (); | 5125 nunits1 = GET_MODE_NUNITS (mode1).to_constant (); |
5107 | 5126 |
5108 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1); | 5127 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), |
5128 stype, nunits1); | |
5109 reduce_with_shift = have_whole_vector_shift (mode1); | 5129 reduce_with_shift = have_whole_vector_shift (mode1); |
5110 if (!VECTOR_MODE_P (mode1)) | 5130 if (!VECTOR_MODE_P (mode1)) |
5111 reduce_with_shift = false; | 5131 reduce_with_shift = false; |
5112 else | 5132 else |
5113 { | 5133 { |
5117 } | 5137 } |
5118 | 5138 |
5119 /* First reduce the vector to the desired vector size we should | 5139 /* First reduce the vector to the desired vector size we should |
5120 do the shift reduction on, by combining upper and lower halves. */ | 5140 do the shift reduction on, by combining upper and lower halves. */ |
5121 new_temp = new_phi_result; | 5141 new_temp = new_phi_result; |
5122 while (sz > sz1) | 5142 while (nunits > nunits1) |
5123 { | 5143 { |
5124 gcc_assert (!slp_reduc); | 5144 nunits /= 2; |
5125 sz /= 2; | 5145 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), |
5126 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz); | 5146 stype, nunits); |
5147 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1)); | |
5127 | 5148 |
5128 /* The target has to make sure we support lowpart/highpart | 5149 /* The target has to make sure we support lowpart/highpart |
5129 extraction, either via direct vector extract or through | 5150 extraction, either via direct vector extract or through |
5130 integer mode punning. */ | 5151 integer mode punning. */ |
5131 tree dst1, dst2; | 5152 tree dst1, dst2; |
5146 dst2 = make_ssa_name (vectype1); | 5167 dst2 = make_ssa_name (vectype1); |
5147 epilog_stmt | 5168 epilog_stmt |
5148 = gimple_build_assign (dst2, BIT_FIELD_REF, | 5169 = gimple_build_assign (dst2, BIT_FIELD_REF, |
5149 build3 (BIT_FIELD_REF, vectype1, | 5170 build3 (BIT_FIELD_REF, vectype1, |
5150 new_temp, TYPE_SIZE (vectype1), | 5171 new_temp, TYPE_SIZE (vectype1), |
5151 bitsize_int (sz * BITS_PER_UNIT))); | 5172 bitsize_int (bitsize))); |
5152 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | 5173 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
5153 } | 5174 } |
5154 else | 5175 else |
5155 { | 5176 { |
5156 /* Extract via punning to appropriately sized integer mode | 5177 /* Extract via punning to appropriately sized integer mode |
5157 vector. */ | 5178 vector. */ |
5158 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, | 5179 tree eltype = build_nonstandard_integer_type (bitsize, 1); |
5159 1); | |
5160 tree etype = build_vector_type (eltype, 2); | 5180 tree etype = build_vector_type (eltype, 2); |
5161 gcc_assert (convert_optab_handler (vec_extract_optab, | 5181 gcc_assert (convert_optab_handler (vec_extract_optab, |
5162 TYPE_MODE (etype), | 5182 TYPE_MODE (etype), |
5163 TYPE_MODE (eltype)) | 5183 TYPE_MODE (eltype)) |
5164 != CODE_FOR_nothing); | 5184 != CODE_FOR_nothing); |
5183 tem = make_ssa_name (eltype); | 5203 tem = make_ssa_name (eltype); |
5184 epilog_stmt | 5204 epilog_stmt |
5185 = gimple_build_assign (tem, BIT_FIELD_REF, | 5205 = gimple_build_assign (tem, BIT_FIELD_REF, |
5186 build3 (BIT_FIELD_REF, eltype, | 5206 build3 (BIT_FIELD_REF, eltype, |
5187 new_temp, TYPE_SIZE (eltype), | 5207 new_temp, TYPE_SIZE (eltype), |
5188 bitsize_int (sz * BITS_PER_UNIT))); | 5208 bitsize_int (bitsize))); |
5189 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | 5209 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
5190 dst2 = make_ssa_name (vectype1); | 5210 dst2 = make_ssa_name (vectype1); |
5191 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, | 5211 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, |
5192 build1 (VIEW_CONVERT_EXPR, | 5212 build1 (VIEW_CONVERT_EXPR, |
5193 vectype1, tem)); | 5213 vectype1, tem)); |
5195 } | 5215 } |
5196 | 5216 |
5197 new_temp = make_ssa_name (vectype1); | 5217 new_temp = make_ssa_name (vectype1); |
5198 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); | 5218 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); |
5199 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | 5219 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
5220 new_phis[0] = epilog_stmt; | |
5200 } | 5221 } |
5201 | 5222 |
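(Editorial sketch: the upper/lower-half combining step above, shown as a four-lane to two-lane reduction in GNU C; PLUS stands in for CODE and the explicit element picks stand in for the lowpart/highpart BIT_FIELD_REFs.)

    typedef int v4si __attribute__ ((vector_size (16)));
    typedef int v2si __attribute__ ((vector_size (8)));

    /* Sketch: one iteration of the while (nunits > nunits1) loop.  */
    v2si
    halve_reduction_sketch (v4si v)
    {
      v2si dst1 = { v[0], v[1] };   /* lowpart extract  */
      v2si dst2 = { v[2], v[3] };   /* highpart extract  */
      return dst1 + dst2;           /* CODE applied lane-wise  */
    }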
5202 if (reduce_with_shift && !slp_reduc) | 5223 if (reduce_with_shift && !slp_reduc) |
5203 { | 5224 { |
5204 int element_bitsize = tree_to_uhwi (bitsize); | 5225 int element_bitsize = tree_to_uhwi (bitsize); |
5224 | 5245 |
5225 if (dump_enabled_p ()) | 5246 if (dump_enabled_p ()) |
5226 dump_printf_loc (MSG_NOTE, vect_location, | 5247 dump_printf_loc (MSG_NOTE, vect_location, |
5227 "Reduce using vector shifts\n"); | 5248 "Reduce using vector shifts\n"); |
5228 | 5249 |
5229 mode1 = TYPE_MODE (vectype1); | 5250 gimple_seq stmts = NULL; |
5230 vec_dest = vect_create_destination_var (scalar_dest, vectype1); | 5251 new_temp = gimple_convert (&stmts, vectype1, new_temp); |
5231 for (elt_offset = nelements / 2; | 5252 for (elt_offset = nelements / 2; |
5232 elt_offset >= 1; | 5253 elt_offset >= 1; |
5233 elt_offset /= 2) | 5254 elt_offset /= 2) |
5234 { | 5255 { |
5235 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); | 5256 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); |
5236 indices.new_vector (sel, 2, nelements); | 5257 indices.new_vector (sel, 2, nelements); |
5237 tree mask = vect_gen_perm_mask_any (vectype1, indices); | 5258 tree mask = vect_gen_perm_mask_any (vectype1, indices); |
5238 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR, | 5259 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1, |
5239 new_temp, zero_vec, mask); | 5260 new_temp, zero_vec, mask); |
5240 new_name = make_ssa_name (vec_dest, epilog_stmt); | 5261 new_temp = gimple_build (&stmts, code, |
5241 gimple_assign_set_lhs (epilog_stmt, new_name); | 5262 vectype1, new_name, new_temp); |
5242 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
5243 | |
5244 epilog_stmt = gimple_build_assign (vec_dest, code, new_name, | |
5245 new_temp); | |
5246 new_temp = make_ssa_name (vec_dest, epilog_stmt); | |
5247 gimple_assign_set_lhs (epilog_stmt, new_temp); | |
5248 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
5249 } | 5263 } |
5264 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); | |
5250 | 5265 |
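(Editorial sketch: the whole-vector-shift scheme above for four integer lanes, with PLUS standing in for CODE; the brace-built temporaries stand in for VEC_PERM_EXPR against the zero vector.)

    typedef int v4si __attribute__ ((vector_size (16)));

    /* Sketch: halve the number of live lanes per step; a single element
       extract (step 2.4 below) then yields the scalar result.  */
    int
    shift_reduce_sketch (v4si v)
    {
      v4si t = { v[2], v[3], 0, 0 };   /* shift by nelements/2 lanes  */
      v = v + t;
      v4si u = { v[1], 0, 0, 0 };      /* shift by one lane  */
      v = v + u;
      return v[0];                     /* final extract  */
    }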
5251 /* 2.4 Extract the final scalar result. Create: | 5266 /* 2.4 Extract the final scalar result. Create: |
5252 s_out3 = extract_field <v_out2, bitpos> */ | 5267 s_out3 = extract_field <v_out2, bitpos> */ |
5253 | 5268 |
5254 if (dump_enabled_p ()) | 5269 if (dump_enabled_p ()) |
5279 dump_printf_loc (MSG_NOTE, vect_location, | 5294 dump_printf_loc (MSG_NOTE, vect_location, |
5280 "Reduce using scalar code.\n"); | 5295 "Reduce using scalar code.\n"); |
5281 | 5296 |
5282 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); | 5297 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); |
5283 int element_bitsize = tree_to_uhwi (bitsize); | 5298 int element_bitsize = tree_to_uhwi (bitsize); |
5299 tree compute_type = TREE_TYPE (vectype); | |
5300 gimple_seq stmts = NULL; | |
5284 FOR_EACH_VEC_ELT (new_phis, i, new_phi) | 5301 FOR_EACH_VEC_ELT (new_phis, i, new_phi) |
5285 { | 5302 { |
5286 int bit_offset; | 5303 int bit_offset; |
5287 if (gimple_code (new_phi) == GIMPLE_PHI) | 5304 if (gimple_code (new_phi) == GIMPLE_PHI) |
5288 vec_temp = PHI_RESULT (new_phi); | 5305 vec_temp = PHI_RESULT (new_phi); |
5289 else | 5306 else |
5290 vec_temp = gimple_assign_lhs (new_phi); | 5307 vec_temp = gimple_assign_lhs (new_phi); |
5291 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, | 5308 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type, |
5292 bitsize_zero_node); | 5309 vec_temp, bitsize, bitsize_zero_node); |
5293 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); | |
5294 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); | |
5295 gimple_assign_set_lhs (epilog_stmt, new_temp); | |
5296 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
5297 | 5310 |
5298 /* In SLP we don't need to apply the reduction operation, so we just | 5311 /* In SLP we don't need to apply the reduction operation, so we just |
5299 collect s' values in SCALAR_RESULTS. */ | 5312 collect s' values in SCALAR_RESULTS. */ |
5300 if (slp_reduc) | 5313 if (slp_reduc) |
5301 scalar_results.safe_push (new_temp); | 5314 scalar_results.safe_push (new_temp); |
5303 for (bit_offset = element_bitsize; | 5316 for (bit_offset = element_bitsize; |
5304 bit_offset < vec_size_in_bits; | 5317 bit_offset < vec_size_in_bits; |
5305 bit_offset += element_bitsize) | 5318 bit_offset += element_bitsize) |
5306 { | 5319 { |
5307 tree bitpos = bitsize_int (bit_offset); | 5320 tree bitpos = bitsize_int (bit_offset); |
5308 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, | 5321 new_name = gimple_build (&stmts, BIT_FIELD_REF, |
5309 bitsize, bitpos); | 5322 compute_type, vec_temp, |
5310 | 5323 bitsize, bitpos); |
5311 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); | |
5312 new_name = make_ssa_name (new_scalar_dest, epilog_stmt); | |
5313 gimple_assign_set_lhs (epilog_stmt, new_name); | |
5314 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
5315 | |
5316 if (slp_reduc) | 5324 if (slp_reduc) |
5317 { | 5325 { |
5318 /* In SLP we don't need to apply the reduction operation, so | 5326 /* In SLP we don't need to apply the reduction operation, so |
5319 we just collect s' values in SCALAR_RESULTS. */ | 5327 we just collect s' values in SCALAR_RESULTS. */ |
5320 new_temp = new_name; | 5328 new_temp = new_name; |
5321 scalar_results.safe_push (new_name); | 5329 scalar_results.safe_push (new_name); |
5322 } | 5330 } |
5323 else | 5331 else |
5324 { | 5332 new_temp = gimple_build (&stmts, code, compute_type, |
5325 epilog_stmt = gimple_build_assign (new_scalar_dest, code, | 5333 new_name, new_temp); |
5326 new_name, new_temp); | |
5327 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); | |
5328 gimple_assign_set_lhs (epilog_stmt, new_temp); | |
5329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
5330 } | |
5331 } | 5334 } |
5332 } | 5335 } |
5333 | 5336 |
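(Editorial sketch: a scalar model of the extract-and-accumulate loop above; PLUS stands in for CODE and the array indexing stands in for the BIT_FIELD_REF extracts.)

    /* Sketch: reduce one vector of NELEMENTS lanes using scalar code.  */
    int
    scalar_code_reduce_sketch (const int *vec_temp, int nelements)
    {
      int new_temp = vec_temp[0];            /* extract at bit offset 0  */
      for (int k = 1; k < nelements; k++)    /* bit_offset += element size  */
        new_temp = new_temp + vec_temp[k];   /* combine with CODE  */
      return new_temp;
    }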
5334 /* The only case where we need to reduce scalar results in SLP is | 5337 /* The only case where we need to reduce scalar results in SLP is |
5335 unrolling. If the size of SCALAR_RESULTS is greater than | 5338 unrolling. If the size of SCALAR_RESULTS is greater than |
5336 REDUC_GROUP_SIZE, we reduce them by combining elements modulo | 5339 REDUC_GROUP_SIZE, we reduce them by combining elements modulo |
5337 REDUC_GROUP_SIZE. */ | 5340 REDUC_GROUP_SIZE. */ |
5338 if (slp_reduc) | 5341 if (slp_reduc) |
5339 { | 5342 { |
5340 tree res, first_res, new_res; | 5343 tree res, first_res, new_res; |
5341 gimple *new_stmt; | |
5342 | 5344 |
5343 /* Reduce multiple scalar results in case of SLP unrolling. */ | 5345 /* Reduce multiple scalar results in case of SLP unrolling. */ |
5344 for (j = group_size; scalar_results.iterate (j, &res); | 5346 for (j = group_size; scalar_results.iterate (j, &res); |
5345 j++) | 5347 j++) |
5346 { | 5348 { |
5347 first_res = scalar_results[j % group_size]; | 5349 first_res = scalar_results[j % group_size]; |
5348 new_stmt = gimple_build_assign (new_scalar_dest, code, | 5350 new_res = gimple_build (&stmts, code, compute_type, |
5349 first_res, res); | 5351 first_res, res); |
5350 new_res = make_ssa_name (new_scalar_dest, new_stmt); | |
5351 gimple_assign_set_lhs (new_stmt, new_res); | |
5352 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT); | |
5353 scalar_results[j % group_size] = new_res; | 5352 scalar_results[j % group_size] = new_res; |
5354 } | 5353 } |
5354 for (k = 0; k < group_size; k++) | |
5355 scalar_results[k] = gimple_convert (&stmts, scalar_type, | |
5356 scalar_results[k]); | |
5355 } | 5357 } |
5356 else | 5358 else |
5357 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ | 5359 { |
5358 scalar_results.safe_push (new_temp); | 5360 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ |
5361 new_temp = gimple_convert (&stmts, scalar_type, new_temp); | |
5362 scalar_results.safe_push (new_temp); | |
5363 } | |
5364 | |
5365 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); | |
5359 } | 5366 } |
5360 | 5367 |
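(Editorial sketch: the modulo combination above for an unrolled SLP reduction, assuming group_size 2 and four collected scalars {a0, b0, a1, b1}; PLUS stands in for CODE.)

    /* Sketch: fold scalar_results[j] into scalar_results[j % group_size],
       leaving { a0 + a1, b0 + b1 } in the first group_size slots.  */
    void
    slp_unroll_combine_sketch (int scalar_results[4])
    {
      for (int j = 2; j < 4; j++)
        scalar_results[j % 2] += scalar_results[j];
    }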
5361 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) | 5368 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) |
5362 == INTEGER_INDUC_COND_REDUCTION) | 5369 && induc_val) |
5363 && !operand_equal_p (initial_def, induc_val, 0)) | |
5364 { | 5370 { |
5365 /* Earlier we set the initial value to be a vector of induc_val | 5371 /* Earlier we set the initial value to be a vector of induc_val |
5366 values. Check the result and if it is induc_val then replace | 5372 values. Check the result and if it is induc_val then replace |
5367 with the original initial value, unless induc_val is | 5373 with the original initial value, unless induc_val is |
5368 the same as initial_def already. */ | 5374 the same as initial_def already. */ |
5374 initial_def, new_temp); | 5380 initial_def, new_temp); |
5375 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | 5381 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
5376 scalar_results[0] = tmp; | 5382 scalar_results[0] = tmp; |
5377 } | 5383 } |
5378 } | 5384 } |
5379 | 5385 |
5380 vect_finalize_reduction: | |
5381 | |
5382 if (double_reduc) | |
5383 loop = loop->inner; | |
5384 | |
5385 /* 2.5 Adjust the final result by the initial value of the reduction | 5386 /* 2.5 Adjust the final result by the initial value of the reduction |
5386 variable. (When such adjustment is not needed, then | 5387 variable. (When such adjustment is not needed, then |
5387 'adjustment_def' is zero). For example, if code is PLUS we create: | 5388 'adjustment_def' is zero). For example, if code is PLUS we create: |
5388 new_temp = loop_exit_def + adjustment_def */ | 5389 new_temp = loop_exit_def + adjustment_def */ |
5389 | 5390 |
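(Editorial sketch: the step 2.5 adjustment for a scalar PLUS reduction with a hypothetical initial value of 10; under that scheme the vector accumulator can start at zero while ADJUSTMENT_DEF carries the initial value, added back exactly once after the vector reduction.)

    /* Sketch: new_temp = loop_exit_def + adjustment_def, as stated above.  */
    int
    adjust_result_sketch (int loop_exit_def)
    {
      int adjustment_def = 10;   /* hypothetical scalar initial value  */
      return loop_exit_def + adjustment_def;
    }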
5390 if (adjustment_def) | 5391 if (adjustment_def) |
5391 { | 5392 { |
5392 gcc_assert (!slp_reduc); | 5393 gcc_assert (!slp_reduc); |
5394 gimple_seq stmts = NULL; | |
5393 if (nested_in_vect_loop) | 5395 if (nested_in_vect_loop) |
5394 { | 5396 { |
5395 new_phi = new_phis[0]; | 5397 new_phi = new_phis[0]; |
5396 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); | 5398 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); |
5397 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); | 5399 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def); |
5398 new_dest = vect_create_destination_var (scalar_dest, vectype); | 5400 new_temp = gimple_build (&stmts, code, vectype, |
5401 PHI_RESULT (new_phi), adjustment_def); | |
5399 } | 5402 } |
5400 else | 5403 else |
5401 { | 5404 { |
5402 new_temp = scalar_results[0]; | 5405 new_temp = scalar_results[0]; |
5403 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); | 5406 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); |
5404 expr = build2 (code, scalar_type, new_temp, adjustment_def); | 5407 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def); |
5405 new_dest = vect_create_destination_var (scalar_dest, scalar_type); | 5408 new_temp = gimple_build (&stmts, code, scalar_type, |
5406 } | 5409 new_temp, adjustment_def); |
5407 | 5410 } |
5408 epilog_stmt = gimple_build_assign (new_dest, expr); | 5411 |
5409 new_temp = make_ssa_name (new_dest, epilog_stmt); | 5412 epilog_stmt = gimple_seq_last_stmt (stmts); |
5410 gimple_assign_set_lhs (epilog_stmt, new_temp); | 5413 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
5411 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
5412 if (nested_in_vect_loop) | 5414 if (nested_in_vect_loop) |
5413 { | 5415 { |
5414 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt); | 5416 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt); |
5415 STMT_VINFO_RELATED_STMT (epilog_stmt_info) | 5417 STMT_VINFO_RELATED_STMT (epilog_stmt_info) |
5416 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi)); | 5418 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi)); |
5423 else | 5425 else |
5424 scalar_results[0] = new_temp; | 5426 scalar_results[0] = new_temp; |
5425 | 5427 |
5426 new_phis[0] = epilog_stmt; | 5428 new_phis[0] = epilog_stmt; |
5427 } | 5429 } |
5430 | |
5431 if (double_reduc) | |
5432 loop = loop->inner; | |
5428 | 5433 |
5429 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit | 5434 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit |
5430 phis with new adjusted scalar results, i.e., replace use <s_out0> | 5435 phis with new adjusted scalar results, i.e., replace use <s_out0> |
5431 with use <s_out4>. | 5436 with use <s_out4>. |
5432 | 5437 |
5469 Therefore, we need to match SCALAR_RESULTS with corresponding statements. | 5474 Therefore, we need to match SCALAR_RESULTS with corresponding statements. |
5470 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results | 5475 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results |
5471 correspond to the first vector stmt, etc. | 5476 correspond to the first vector stmt, etc. |
5472 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */ | 5477 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */ |
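A worked instance of the RATIO mapping described above, with made-up sizes: if REDUC_GROUP_SIZE is 4 and two vector stmts were generated, RATIO is 2 and scalar result K is taken from vector stmt K / RATIO:

/* group_size = 4, new_phis.length () = 2  =>  ratio = 2
   scalar_results[0], scalar_results[1]  <-  new_phis[0]
   scalar_results[2], scalar_results[3]  <-  new_phis[1]  */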
5473 if (group_size > new_phis.length ()) | 5478 if (group_size > new_phis.length ()) |
5474 { | 5479 gcc_assert (!(group_size % new_phis.length ())); |
5475 ratio = group_size / new_phis.length (); | 5480 |
5476 gcc_assert (!(group_size % new_phis.length ())); | |
5477 } | |
5478 else | |
5479 ratio = 1; | |
5480 | |
5481 stmt_vec_info epilog_stmt_info = NULL; | |
5482 for (k = 0; k < group_size; k++) | 5481 for (k = 0; k < group_size; k++) |
5483 { | 5482 { |
5484 if (k % ratio == 0) | |
5485 { | |
5486 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]); | |
5487 reduction_phi_info = reduction_phis[k / ratio]; | |
5488 if (double_reduc) | |
5489 inner_phi = inner_phis[k / ratio]; | |
5490 } | |
5491 | |
5492 if (slp_reduc) | 5483 if (slp_reduc) |
5493 { | 5484 { |
5494 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k]; | 5485 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k]; |
5495 | 5486 |
5496 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info); | 5487 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info); |
5497 /* SLP statements can't participate in patterns. */ | 5488 /* SLP statements can't participate in patterns. */ |
5498 gcc_assert (!orig_stmt_info); | 5489 gcc_assert (!orig_stmt_info); |
5499 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); | 5490 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); |
5500 } | 5491 } |
5501 | 5492 |
5502 phis.create (3); | |
5503 /* Find the loop-closed-use at the loop exit of the original scalar | |
5504 result. (The reduction result is expected to have two immediate uses - | |
5505 one at the latch block, and one at the loop exit). */ | |
5506 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) | |
5507 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))) | |
5508 && !is_gimple_debug (USE_STMT (use_p))) | |
5509 phis.safe_push (USE_STMT (use_p)); | |
5510 | |
5511 /* While we expect to have found an exit_phi because of loop-closed-ssa | |
5512 form, we can end up without one if the scalar cycle is dead. */ | |
5513 | |
5514 FOR_EACH_VEC_ELT (phis, i, exit_phi) | |
5515 { | |
5516 if (outer_loop) | |
5517 { | |
5518 stmt_vec_info exit_phi_vinfo | |
5519 = loop_vinfo->lookup_stmt (exit_phi); | |
5520 gphi *vect_phi; | |
5521 | |
5522 /* FORNOW. Currently not supporting the case that an inner-loop | |
5523 reduction is not used in the outer-loop (but only outside the | |
5524 outer-loop), unless it is double reduction. */ | |
5525 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo) | |
5526 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)) | |
5527 || double_reduc); | |
5528 | |
5529 if (double_reduc) | |
5530 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi; | |
5531 else | |
5532 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info; | |
5533 if (!double_reduc | |
5534 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo) | |
5535 != vect_double_reduction_def) | |
5536 continue; | |
5537 | |
5538 /* Handle double reduction: | |
5539 | |
5540 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop) | |
5541 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop) | |
5542 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop) | |
5543 stmt4: s2 = phi <s4> - double reduction stmt (outer loop) | |
5544 | |
5545 At that point the regular reduction (stmt2 and stmt3) is | |
5546 already vectorized, as well as the exit phi node, stmt4. | |
5547 Here we vectorize the phi node of double reduction, stmt1, and | |
5548 update all relevant statements. */ | |
5549 | |
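For reference, a source loop nest that produces the stmt1..stmt4 shape sketched above; the function and array names are invented for illustration:

/* s is an outer-loop (double) reduction computed by the inner loop.  */
int
double_reduc_example (int a[100][100], int n, int m)
{
  int s = 0;                  /* feeds stmt1, the double reduction phi */
  for (int i = 0; i < n; i++)
    for (int j = 0; j < m; j++)
      s += a[i][j];           /* stmt3, the regular inner-loop reduc stmt */
  return s;
}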
5550 /* Go through all the uses of s2 to find double reduction phi | |
5551 node, i.e., stmt1 above. */ | |
5552 orig_name = PHI_RESULT (exit_phi); | |
5553 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) | |
5554 { | |
5555 stmt_vec_info use_stmt_vinfo; | |
5556 tree vect_phi_init, preheader_arg, vect_phi_res; | |
5557 basic_block bb = gimple_bb (use_stmt); | |
5558 | |
5559 /* Check that USE_STMT is really double reduction phi | |
5560 node. */ | |
5561 if (gimple_code (use_stmt) != GIMPLE_PHI | |
5562 || gimple_phi_num_args (use_stmt) != 2 | |
5563 || bb->loop_father != outer_loop) | |
5564 continue; | |
5565 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt); | |
5566 if (!use_stmt_vinfo | |
5567 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) | |
5568 != vect_double_reduction_def) | |
5569 continue; | |
5570 | |
5571 /* Create vector phi node for double reduction: | |
5572 vs1 = phi <vs0, vs2> | |
5573 vs1 was created previously in this function by a call to | |
5574 vect_get_vec_def_for_operand and is stored in | |
5575 vec_initial_def; | |
5576 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI; | |
5577 vs0 is created here. */ | |
5578 | |
5579 /* Create vector phi node. */ | |
5580 vect_phi = create_phi_node (vec_initial_def, bb); | |
5581 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi); | |
5582 | |
5583 /* Create vs0 - initial def of the double reduction phi. */ | |
5584 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, | |
5585 loop_preheader_edge (outer_loop)); | |
5586 vect_phi_init = get_initial_def_for_reduction | |
5587 (stmt_info, preheader_arg, NULL); | |
5588 | |
5589 /* Update phi node arguments with vs0 and vs2. */ | |
5590 add_phi_arg (vect_phi, vect_phi_init, | |
5591 loop_preheader_edge (outer_loop), | |
5592 UNKNOWN_LOCATION); | |
5593 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt), | |
5594 loop_latch_edge (outer_loop), UNKNOWN_LOCATION); | |
5595 if (dump_enabled_p ()) | |
5596 dump_printf_loc (MSG_NOTE, vect_location, | |
5597 "created double reduction phi node: %G", | |
5598 vect_phi); | |
5599 | |
5600 vect_phi_res = PHI_RESULT (vect_phi); | |
5601 | |
5602 /* Replace the use, i.e., set the correct vs1 in the regular | |
5603 reduction phi node. FORNOW, NCOPIES is always 1, so the | |
5604 loop is redundant. */ | |
5605 stmt_vec_info use_info = reduction_phi_info; | |
5606 for (j = 0; j < ncopies; j++) | |
5607 { | |
5608 edge pr_edge = loop_preheader_edge (loop); | |
5609 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt), | |
5610 pr_edge->dest_idx, vect_phi_res); | |
5611 use_info = STMT_VINFO_RELATED_STMT (use_info); | |
5612 } | |
5613 } | |
5614 } | |
5615 } | |
5616 | |
5617 phis.release (); | |
5618 if (nested_in_vect_loop) | 5493 if (nested_in_vect_loop) |
5619 { | 5494 { |
5620 if (double_reduc) | 5495 if (double_reduc) |
5621 loop = outer_loop; | 5496 loop = outer_loop; |
5622 else | 5497 else |
5623 continue; | 5498 gcc_unreachable (); |
5624 } | 5499 } |
5625 | 5500 |
5626 phis.create (3); | 5501 phis.create (3); |
5627 /* Find the loop-closed-use at the loop exit of the original scalar | 5502 /* Find the loop-closed-use at the loop exit of the original scalar |
5628 result. (The reduction result is expected to have two immediate uses, | 5503 result. (The reduction result is expected to have two immediate uses, |
5656 { | 5531 { |
5657 /* Replace the uses: */ | 5532 /* Replace the uses: */ |
5658 orig_name = PHI_RESULT (exit_phi); | 5533 orig_name = PHI_RESULT (exit_phi); |
5659 scalar_result = scalar_results[k]; | 5534 scalar_result = scalar_results[k]; |
5660 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) | 5535 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) |
5661 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) | 5536 { |
5662 SET_USE (use_p, scalar_result); | 5537 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) |
5538 SET_USE (use_p, scalar_result); | |
5539 update_stmt (use_stmt); | |
5540 } | |
5663 } | 5541 } |
5664 | 5542 |
5665 phis.release (); | 5543 phis.release (); |
5666 } | 5544 } |
5667 } | 5545 } |
5716 lhs = new_name; | 5594 lhs = new_name; |
5717 } | 5595 } |
5718 return lhs; | 5596 return lhs; |
5719 } | 5597 } |
5720 | 5598 |
5599 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the | |
5600 type of the vector input. */ | |
5601 | |
5602 static internal_fn | |
5603 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in) | |
5604 { | |
5605 internal_fn mask_reduc_fn; | |
5606 | |
5607 switch (reduc_fn) | |
5608 { | |
5609 case IFN_FOLD_LEFT_PLUS: | |
5610 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS; | |
5611 break; | |
5612 | |
5613 default: | |
5614 return IFN_LAST; | |
5615 } | |
5616 | |
5617 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in, | |
5618 OPTIMIZE_FOR_SPEED)) | |
5619 return mask_reduc_fn; | |
5620 return IFN_LAST; | |
5621 } | |
5622 | |
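The lane-by-lane behavior the masked variant has to provide can be modeled in plain C as below; this is only a semantic sketch of a mask_fold_left_plus-style operation, not the internal-function implementation:

/* Inactive lanes contribute nothing, and active lanes are still
   accumulated strictly in lane order (in-order reduction).  */
double
mask_fold_left_plus_model (double res, const double *vec,
                           const unsigned char *mask, int nunits)
{
  for (int i = 0; i < nunits; i++)
    if (mask[i])
      res += vec[i];
  return res;
}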
5721 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the | 5623 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the |
5722 statement that sets the live-out value. REDUC_DEF_STMT is the phi | 5624 statement that sets the live-out value. REDUC_DEF_STMT is the phi |
5723 statement. CODE is the operation performed by STMT_INFO and OPS are | 5625 statement. CODE is the operation performed by STMT_INFO and OPS are |
5724 its scalar operands. REDUC_INDEX is the index of the operand in | 5626 its scalar operands. REDUC_INDEX is the index of the operand in |
5725 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that | 5627 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that |
5735 tree_code code, internal_fn reduc_fn, | 5637 tree_code code, internal_fn reduc_fn, |
5736 tree ops[3], tree vectype_in, | 5638 tree ops[3], tree vectype_in, |
5737 int reduc_index, vec_loop_masks *masks) | 5639 int reduc_index, vec_loop_masks *masks) |
5738 { | 5640 { |
5739 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | 5641 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
5740 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 5642 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
5741 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); | 5643 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); |
5742 stmt_vec_info new_stmt_info = NULL; | 5644 stmt_vec_info new_stmt_info = NULL; |
5645 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in); | |
5743 | 5646 |
5744 int ncopies; | 5647 int ncopies; |
5745 if (slp_node) | 5648 if (slp_node) |
5746 ncopies = 1; | 5649 ncopies = 1; |
5747 else | 5650 else |
5748 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); | 5651 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); |
5749 | 5652 |
5750 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info)); | 5653 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info)); |
5751 gcc_assert (ncopies == 1); | 5654 gcc_assert (ncopies == 1); |
5752 gcc_assert (TREE_CODE_LENGTH (code) == binary_op); | 5655 gcc_assert (TREE_CODE_LENGTH (code) == binary_op); |
5753 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1)); | |
5754 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) | |
5755 == FOLD_LEFT_REDUCTION); | |
5756 | 5656 |
5757 if (slp_node) | 5657 if (slp_node) |
5758 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), | 5658 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), |
5759 TYPE_VECTOR_SUBPARTS (vectype_in))); | 5659 TYPE_VECTOR_SUBPARTS (vectype_in))); |
5760 | 5660 |
5763 int group_size = 1; | 5663 int group_size = 1; |
5764 stmt_vec_info scalar_dest_def_info; | 5664 stmt_vec_info scalar_dest_def_info; |
5765 auto_vec<tree> vec_oprnds0; | 5665 auto_vec<tree> vec_oprnds0; |
5766 if (slp_node) | 5666 if (slp_node) |
5767 { | 5667 { |
5768 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL, | 5668 auto_vec<vec<tree> > vec_defs (2); |
5769 slp_node); | 5669 vect_get_slp_defs (slp_node, &vec_defs); |
5670 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]); | |
5671 vec_defs[0].release (); | |
5672 vec_defs[1].release (); | |
5770 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); | 5673 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); |
5771 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; | 5674 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; |
5772 } | 5675 } |
5773 else | 5676 else |
5774 { | 5677 { |
5808 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0); | 5711 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0); |
5809 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); | 5712 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); |
5810 def0 = negated; | 5713 def0 = negated; |
5811 } | 5714 } |
5812 | 5715 |
5813 if (mask) | 5716 if (mask && mask_reduc_fn == IFN_LAST) |
5814 def0 = merge_with_identity (gsi, mask, vectype_out, def0, | 5717 def0 = merge_with_identity (gsi, mask, vectype_out, def0, |
5815 vector_identity); | 5718 vector_identity); |
5816 | 5719 |
5817 /* On the first iteration the input is simply the scalar phi | 5720 /* On the first iteration the input is simply the scalar phi |
5818 result, and for subsequent iterations it is the output of | 5721 result, and for subsequent iterations it is the output of |
5819 the preceding operation. */ | 5722 the preceding operation. */ |
5820 if (reduc_fn != IFN_LAST) | 5723 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST)) |
5821 { | 5724 { |
5822 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0); | 5725 if (mask && mask_reduc_fn != IFN_LAST) |
5726 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var, | |
5727 def0, mask); | |
5728 else | |
5729 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, | |
5730 def0); | |
5823 /* For chained SLP reductions the output of the previous reduction | 5731 /* For chained SLP reductions the output of the previous reduction |
5824 operation serves as the input of the next. For the final statement | 5732 operation serves as the input of the next. For the final statement |
5825 the output cannot be a temporary - we reuse the original | 5733 the output cannot be a temporary - we reuse the original |
5826 scalar destination of the last statement. */ | 5734 scalar destination of the last statement. */ |
5827 if (i != vec_num - 1) | 5735 if (i != vec_num - 1) |
5837 reduc_var, def0); | 5745 reduc_var, def0); |
5838 new_stmt = SSA_NAME_DEF_STMT (reduc_var); | 5746 new_stmt = SSA_NAME_DEF_STMT (reduc_var); |
5839 /* Remove the statement, so that we can use the same code paths | 5747 /* Remove the statement, so that we can use the same code paths |
5840 as for statements that we've just created. */ | 5748 as for statements that we've just created. */ |
5841 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt); | 5749 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt); |
5842 gsi_remove (&tmp_gsi, false); | 5750 gsi_remove (&tmp_gsi, true); |
5843 } | 5751 } |
5844 | 5752 |
5845 if (i == vec_num - 1) | 5753 if (i == vec_num - 1) |
5846 { | 5754 { |
5847 gimple_set_lhs (new_stmt, scalar_dest); | 5755 gimple_set_lhs (new_stmt, scalar_dest); |
5866 | 5774 |
5867 Check if STMT_VINFO (which is part of loop LOOP) both increments and | 5775 Check if STMT_VINFO (which is part of loop LOOP) both increments and |
5868 does not cause overflow. */ | 5776 does not cause overflow. */ |
5869 | 5777 |
5870 static bool | 5778 static bool |
5871 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop) | 5779 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop) |
5872 { | 5780 { |
5873 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt); | 5781 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt); |
5874 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo); | 5782 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo); |
5875 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo); | 5783 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo); |
5876 tree lhs_type = TREE_TYPE (gimple_phi_result (phi)); | 5784 tree lhs_type = TREE_TYPE (gimple_phi_result (phi)); |
5900 if (overflow) | 5808 if (overflow) |
5901 return false; | 5809 return false; |
5902 | 5810 |
5903 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type)) | 5811 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type)) |
5904 <= TYPE_PRECISION (lhs_type)); | 5812 <= TYPE_PRECISION (lhs_type)); |
5813 } | |
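A worked instance of the check above, with made-up numbers:

/* lhs_type = unsigned char (8 bits), base = 0, step = 1.
   niters = 200: max_loop_value = 0 + 1 * 200 = 200, and
   wi::min_precision (200, UNSIGNED) = 8 <= 8, so no wrapping: return true.
   niters = 300: max_loop_value = 300 needs 9 bits > 8, so the
   induction may wrap and the function returns false.  */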
5814 | |
5815 /* Check if masking can be supported by inserting a conditional expression. | |
5816 CODE is the code for the operation. COND_FN is the conditional internal | |
5817 function, if it exists. VECTYPE_IN is the type of the vector input. */ | |
5818 static bool | |
5819 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn, | |
5820 tree vectype_in) | |
5821 { | |
5822 if (cond_fn != IFN_LAST | |
5823 && direct_internal_fn_supported_p (cond_fn, vectype_in, | |
5824 OPTIMIZE_FOR_SPEED)) | |
5825 return false; | |
5826 | |
5827 switch (code) | |
5828 { | |
5829 case DOT_PROD_EXPR: | |
5830 case SAD_EXPR: | |
5831 return true; | |
5832 | |
5833 default: | |
5834 return false; | |
5835 } | |
5836 } | |
5837 | |
5838 /* Insert a conditional expression to enable masked vectorization. CODE is the | |
5839 code for the operation. VOP is the array of operands. MASK is the loop | |
5840 mask. GSI is a statement iterator used to place the new conditional | |
5841 expression. */ | |
5842 static void | |
5843 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask, | |
5844 gimple_stmt_iterator *gsi) | |
5845 { | |
5846 switch (code) | |
5847 { | |
5848 case DOT_PROD_EXPR: | |
5849 { | |
5850 tree vectype = TREE_TYPE (vop[1]); | |
5851 tree zero = build_zero_cst (vectype); | |
5852 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); | |
5853 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, | |
5854 mask, vop[1], zero); | |
5855 gsi_insert_before (gsi, select, GSI_SAME_STMT); | |
5856 vop[1] = masked_op1; | |
5857 break; | |
5858 } | |
5859 | |
5860 case SAD_EXPR: | |
5861 { | |
5862 tree vectype = TREE_TYPE (vop[1]); | |
5863 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); | |
5864 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, | |
5865 mask, vop[1], vop[0]); | |
5866 gsi_insert_before (gsi, select, GSI_SAME_STMT); | |
5867 vop[1] = masked_op1; | |
5868 break; | |
5869 } | |
5870 | |
5871 default: | |
5872 gcc_unreachable (); | |
5873 } | |
5905 } | 5874 } |
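The selected values are the identities of the two operations; a scalar C model of one lane shows why masked-off lanes then contribute nothing (illustrative only):

/* DOT_PROD: selecting 0 for an inactive lane zeroes its product.  */
int
masked_dot_prod_lane (int active, short op0, short op1, int acc)
{
  short m1 = active ? op1 : 0;
  return acc + op0 * m1;
}

/* SAD: selecting op0 makes the lane's |op0 - m1| equal to 0.  */
int
masked_sad_lane (int active, unsigned char op0, unsigned char op1, int acc)
{
  unsigned char m1 = active ? op1 : op0;
  return acc + (op0 > m1 ? op0 - m1 : m1 - op0);
}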
5906 | 5875 |
5907 /* Function vectorizable_reduction. | 5876 /* Function vectorizable_reduction. |
5908 | 5877 |
5909 Check if STMT_INFO performs a reduction operation that can be vectorized. | 5878 Check if STMT_INFO performs a reduction operation that can be vectorized. |
5945 indicates the actual level of parallelism (V8HI in the example), so | 5914 indicates the actual level of parallelism (V8HI in the example), so |
5946 that the right vectorization factor can be derived. This vectype | 5915 that the right vectorization factor can be derived. This vectype |
5947 corresponds to the type of arguments to the reduction stmt, and should *NOT* | 5916 corresponds to the type of arguments to the reduction stmt, and should *NOT* |
5948 be used to create the vectorized stmt. The right vectype for the vectorized | 5917 be used to create the vectorized stmt. The right vectype for the vectorized |
5949 stmt is obtained from the type of the result X: | 5918 stmt is obtained from the type of the result X: |
5950 get_vectype_for_scalar_type (TREE_TYPE (X)) | 5919 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) |
5951 | 5920 |
5952 This means that, contrary to "regular" reductions (or "regular" stmts in | 5921 This means that, contrary to "regular" reductions (or "regular" stmts in |
5953 general), the following equation: | 5922 general), the following equation: |
5954 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) | 5923 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) |
5955 does *NOT* necessarily hold for reduction patterns. */ | 5924 does *NOT* necessarily hold for reduction patterns. */ |
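An illustrative widening summation making the two vectypes differ (the vector modes named are target-dependent examples):

/* short a[N]; int sum = 0;
   for (int i = 0; i < N; i++)
     sum += a[i];                  // recognized as a widening summation
   vectype_in:  vector of the argument type, e.g. V8HI
   vectype_out: vector of the result type X, e.g. V4SI  */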
5956 | 5925 |
5957 bool | 5926 bool |
5958 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, | 5927 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node, |
5959 stmt_vec_info *vec_stmt, slp_tree slp_node, | |
5960 slp_instance slp_node_instance, | 5928 slp_instance slp_node_instance, |
5961 stmt_vector_for_cost *cost_vec) | 5929 stmt_vector_for_cost *cost_vec) |
5962 { | 5930 { |
5963 tree vec_dest; | |
5964 tree scalar_dest; | 5931 tree scalar_dest; |
5965 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); | |
5966 tree vectype_in = NULL_TREE; | 5932 tree vectype_in = NULL_TREE; |
5967 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | 5933 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
5968 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 5934 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
5969 enum tree_code code, orig_code; | 5935 enum vect_def_type cond_reduc_dt = vect_unknown_def_type; |
5970 internal_fn reduc_fn; | |
5971 machine_mode vec_mode; | |
5972 int op_type; | |
5973 optab optab; | |
5974 tree new_temp = NULL_TREE; | |
5975 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type; | |
5976 stmt_vec_info cond_stmt_vinfo = NULL; | 5936 stmt_vec_info cond_stmt_vinfo = NULL; |
5977 enum tree_code cond_reduc_op_code = ERROR_MARK; | |
5978 tree scalar_type; | 5937 tree scalar_type; |
5979 bool is_simple_use; | |
5980 int i; | 5938 int i; |
5981 int ncopies; | 5939 int ncopies; |
5982 int epilog_copies; | |
5983 stmt_vec_info prev_stmt_info, prev_phi_info; | |
5984 bool single_defuse_cycle = false; | 5940 bool single_defuse_cycle = false; |
5985 stmt_vec_info new_stmt_info = NULL; | 5941 bool nested_cycle = false; |
5986 int j; | |
5987 tree ops[3]; | |
5988 enum vect_def_type dts[3]; | |
5989 bool nested_cycle = false, found_nested_cycle_def = false; | |
5990 bool double_reduc = false; | 5942 bool double_reduc = false; |
5991 basic_block def_bb; | |
5992 struct loop * def_stmt_loop; | |
5993 tree def_arg; | |
5994 auto_vec<tree> vec_oprnds0; | |
5995 auto_vec<tree> vec_oprnds1; | |
5996 auto_vec<tree> vec_oprnds2; | |
5997 auto_vec<tree> vect_defs; | |
5998 auto_vec<stmt_vec_info> phis; | |
5999 int vec_num; | 5943 int vec_num; |
6000 tree def0, tem; | 5944 tree tem; |
6001 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; | 5945 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; |
6002 tree cond_reduc_val = NULL_TREE; | 5946 tree cond_reduc_val = NULL_TREE; |
6003 | 5947 |
6004 /* Make sure it was already recognized as a reduction computation. */ | 5948 /* Make sure it was already recognized as a reduction computation. */ |
6005 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def | 5949 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def |
5950 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def | |
6006 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle) | 5951 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle) |
6007 return false; | 5952 return false; |
6008 | 5953 |
5954 /* The stmt we store reduction analysis meta on. */ | |
5955 stmt_vec_info reduc_info = info_for_reduction (stmt_info); | |
5956 reduc_info->is_reduc_info = true; | |
5957 | |
5958 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) | |
5959 { | |
5960 if (is_a <gphi *> (stmt_info->stmt)) | |
5961 /* Analysis for double-reduction is done on the outer | |
5962 loop PHI; nested cycles have no further restrictions. */ |
5963 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type; | |
5964 else | |
5965 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; | |
5966 return true; | |
5967 } | |
5968 | |
5969 stmt_vec_info orig_stmt_of_analysis = stmt_info; | |
5970 stmt_vec_info phi_info = stmt_info; | |
5971 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def | |
5972 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) | |
5973 { | |
5974 if (!is_a <gphi *> (stmt_info->stmt)) | |
5975 { | |
5976 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; | |
5977 return true; | |
5978 } | |
5979 if (slp_node) | |
5980 { | |
5981 slp_node_instance->reduc_phis = slp_node; | |
5982 /* ??? We're leaving slp_node to point to the PHIs, we only | |
5983 need it to get at the number of vector stmts which wasn't | |
5984 yet initialized for the instance root. */ | |
5985 } | |
5986 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) | |
5987 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info)); | |
5988 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */ | |
5989 { | |
5990 use_operand_p use_p; | |
5991 gimple *use_stmt; | |
5992 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt), | |
5993 &use_p, &use_stmt); | |
5994 gcc_assert (res); | |
5995 phi_info = loop_vinfo->lookup_stmt (use_stmt); | |
5996 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); | |
5997 } | |
5998 } | |
5999 | |
6000 /* PHIs should not participate in patterns. */ | |
6001 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); | |
6002 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt); | |
6003 | |
6004 /* Verify following REDUC_IDX from the latch def leads us back to the PHI | |
6005 and compute the reduction chain length. */ | |
6006 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, | |
6007 loop_latch_edge (loop)); | |
6008 unsigned reduc_chain_length = 0; | |
6009 bool only_slp_reduc_chain = true; | |
6010 stmt_info = NULL; | |
6011 while (reduc_def != PHI_RESULT (reduc_def_phi)) | |
6012 { | |
6013 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def); | |
6014 stmt_vec_info vdef = vect_stmt_to_vectorize (def); | |
6015 if (STMT_VINFO_REDUC_IDX (vdef) == -1) | |
6016 { | |
6017 if (dump_enabled_p ()) | |
6018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6019 "reduction chain broken by patterns.\n"); | |
6020 return false; | |
6021 } | |
6022 if (!REDUC_GROUP_FIRST_ELEMENT (vdef)) | |
6023 only_slp_reduc_chain = false; | |
6024 /* ??? For epilogue generation live members of the chain need | |
6025 to point back to the PHI via their original stmt for | |
6026 info_for_reduction to work. */ | |
6027 if (STMT_VINFO_LIVE_P (vdef)) | |
6028 STMT_VINFO_REDUC_DEF (def) = phi_info; | |
6029 gassign *assign = dyn_cast <gassign *> (vdef->stmt); | |
6030 if (!assign) | |
6031 { | |
6032 if (dump_enabled_p ()) | |
6033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6034 "reduction chain includes calls.\n"); | |
6035 return false; | |
6036 } | |
6037 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) | |
6038 { | |
6039 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)), | |
6040 TREE_TYPE (gimple_assign_rhs1 (assign)))) | |
6041 { | |
6042 if (dump_enabled_p ()) | |
6043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6044 "conversion in the reduction chain.\n"); | |
6045 return false; | |
6046 } | |
6047 } | |
6048 else if (!stmt_info) | |
6049 /* First non-conversion stmt. */ | |
6050 stmt_info = vdef; | |
6051 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef)); | |
6052 reduc_chain_length++; | |
6053 } | |
6054 /* PHIs should not participate in patterns. */ | |
6055 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); | |
6056 | |
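For illustration, a reduction chain of length three that this walk accepts; the SSA names are invented:

/* sum_0 = PHI <init, sum_3>     <- reduc_def_phi
   sum_1 = sum_0 + a[i];         <- REDUC_IDX selects the sum_0 operand
   sum_2 = sum_1 + b[i];
   sum_3 = sum_2 + c[i];         <- latch def the walk starts from
   Following the REDUC_IDX operand from sum_3 reaches the PHI result
   after three steps, so reduc_chain_length == 3.  */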
6009 if (nested_in_vect_loop_p (loop, stmt_info)) | 6057 if (nested_in_vect_loop_p (loop, stmt_info)) |
6010 { | 6058 { |
6011 loop = loop->inner; | 6059 loop = loop->inner; |
6012 nested_cycle = true; | 6060 nested_cycle = true; |
6013 } | 6061 } |
6014 | 6062 |
6063 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last | |
6064 element. */ | |
6065 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) | |
6066 { | |
6067 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info)); | |
6068 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); | |
6069 } | |
6015 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) | 6070 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
6016 gcc_assert (slp_node | 6071 gcc_assert (slp_node |
6017 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); | 6072 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); |
6018 | |
6019 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt)) | |
6020 { | |
6021 tree phi_result = gimple_phi_result (phi); | |
6022 /* Analysis is fully done on the reduction stmt invocation. */ | |
6023 if (! vec_stmt) | |
6024 { | |
6025 if (slp_node) | |
6026 slp_node_instance->reduc_phis = slp_node; | |
6027 | |
6028 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; | |
6029 return true; | |
6030 } | |
6031 | |
6032 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) | |
6033 /* Leave the scalar phi in place. Note that checking | |
6034 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works | |
6035 for reductions involving a single statement. */ | |
6036 return true; | |
6037 | |
6038 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); | |
6039 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info); | |
6040 | |
6041 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info) | |
6042 == EXTRACT_LAST_REDUCTION) | |
6043 /* Leave the scalar phi in place. */ | |
6044 return true; | |
6045 | |
6046 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt); | |
6047 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k) | |
6048 { | |
6049 tree op = gimple_op (reduc_stmt, k); | |
6050 if (op == phi_result) | |
6051 continue; | |
6052 if (k == 1 | |
6053 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR) | |
6054 continue; | |
6055 if (!vectype_in | |
6056 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) | |
6057 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op))))) | |
6058 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op)); | |
6059 break; | |
6060 } | |
6061 gcc_assert (vectype_in); | |
6062 | |
6063 if (slp_node) | |
6064 ncopies = 1; | |
6065 else | |
6066 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); | |
6067 | |
6068 stmt_vec_info use_stmt_info; | |
6069 if (ncopies > 1 | |
6070 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live | |
6071 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result)) | |
6072 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info) | |
6073 single_defuse_cycle = true; | |
6074 | |
6075 /* Create the destination vector */ | |
6076 scalar_dest = gimple_assign_lhs (reduc_stmt); | |
6077 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); | |
6078 | |
6079 if (slp_node) | |
6080 /* The size vect_schedule_slp_instance computes is off for us. */ | |
6081 vec_num = vect_get_num_vectors | |
6082 (LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
6083 * SLP_TREE_SCALAR_STMTS (slp_node).length (), | |
6084 vectype_in); | |
6085 else | |
6086 vec_num = 1; | |
6087 | |
6088 /* Generate the reduction PHIs upfront. */ | |
6089 prev_phi_info = NULL; | |
6090 for (j = 0; j < ncopies; j++) | |
6091 { | |
6092 if (j == 0 || !single_defuse_cycle) | |
6093 { | |
6094 for (i = 0; i < vec_num; i++) | |
6095 { | |
6096 /* Create the reduction-phi that defines the reduction | |
6097 operand. */ | |
6098 gimple *new_phi = create_phi_node (vec_dest, loop->header); | |
6099 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); | |
6100 | |
6101 if (slp_node) | |
6102 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info); | |
6103 else | |
6104 { | |
6105 if (j == 0) | |
6106 STMT_VINFO_VEC_STMT (stmt_info) | |
6107 = *vec_stmt = new_phi_info; | |
6108 else | |
6109 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info; | |
6110 prev_phi_info = new_phi_info; | |
6111 } | |
6112 } | |
6113 } | |
6114 } | |
6115 | |
6116 return true; | |
6117 } | |
6118 | 6073 |
6119 /* 1. Is vectorizable reduction? */ | 6074 /* 1. Is vectorizable reduction? */ |
6120 /* Not supportable if the reduction variable is used in the loop, unless | 6075 /* Not supportable if the reduction variable is used in the loop, unless |
6121 it's a reduction chain. */ | 6076 it's a reduction chain. */ |
6122 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer | 6077 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer |
6145 | 6100 |
6146 /* 3. Check the operands of the operation. The first operands are defined | 6101 /* 3. Check the operands of the operation. The first operands are defined |
6147 inside the loop body. The last operand is the reduction variable, | 6102 inside the loop body. The last operand is the reduction variable, |
6148 which is defined by the loop-header-phi. */ | 6103 which is defined by the loop-header-phi. */ |
6149 | 6104 |
6105 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); | |
6106 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; | |
6150 gassign *stmt = as_a <gassign *> (stmt_info->stmt); | 6107 gassign *stmt = as_a <gassign *> (stmt_info->stmt); |
6151 | 6108 enum tree_code code = gimple_assign_rhs_code (stmt); |
6152 /* Flatten RHS. */ | 6109 bool lane_reduc_code_p |
6153 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) | 6110 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR); |
6154 { | 6111 int op_type = TREE_CODE_LENGTH (code); |
6155 case GIMPLE_BINARY_RHS: | |
6156 code = gimple_assign_rhs_code (stmt); | |
6157 op_type = TREE_CODE_LENGTH (code); | |
6158 gcc_assert (op_type == binary_op); | |
6159 ops[0] = gimple_assign_rhs1 (stmt); | |
6160 ops[1] = gimple_assign_rhs2 (stmt); | |
6161 break; | |
6162 | |
6163 case GIMPLE_TERNARY_RHS: | |
6164 code = gimple_assign_rhs_code (stmt); | |
6165 op_type = TREE_CODE_LENGTH (code); | |
6166 gcc_assert (op_type == ternary_op); | |
6167 ops[0] = gimple_assign_rhs1 (stmt); | |
6168 ops[1] = gimple_assign_rhs2 (stmt); | |
6169 ops[2] = gimple_assign_rhs3 (stmt); | |
6170 break; | |
6171 | |
6172 case GIMPLE_UNARY_RHS: | |
6173 return false; | |
6174 | |
6175 default: | |
6176 gcc_unreachable (); | |
6177 } | |
6178 | |
6179 if (code == COND_EXPR && slp_node) | |
6180 return false; | |
6181 | 6112 |
6182 scalar_dest = gimple_assign_lhs (stmt); | 6113 scalar_dest = gimple_assign_lhs (stmt); |
6183 scalar_type = TREE_TYPE (scalar_dest); | 6114 scalar_type = TREE_TYPE (scalar_dest); |
6184 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) | 6115 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) |
6185 && !SCALAR_FLOAT_TYPE_P (scalar_type)) | 6116 && !SCALAR_FLOAT_TYPE_P (scalar_type)) |
6187 | 6118 |
6188 /* Do not try to vectorize bit-precision reductions. */ | 6119 /* Do not try to vectorize bit-precision reductions. */ |
6189 if (!type_has_mode_precision_p (scalar_type)) | 6120 if (!type_has_mode_precision_p (scalar_type)) |
6190 return false; | 6121 return false; |
6191 | 6122 |
6123 /* For lane-reducing ops we're reducing the number of reduction PHIs | |
6124 which means the only use of that may be in the lane-reducing operation. */ | |
6125 if (lane_reduc_code_p | |
6126 && reduc_chain_length != 1 | |
6127 && !only_slp_reduc_chain) | |
6128 { | |
6129 if (dump_enabled_p ()) | |
6130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6131 "lane-reducing reduction with extra stmts.\n"); | |
6132 return false; | |
6133 } | |
6134 | |
6192 /* All uses but the last are expected to be defined in the loop. | 6135 /* All uses but the last are expected to be defined in the loop. |
6193 The last use is the reduction variable. In case of nested cycle this | 6136 The last use is the reduction variable. In case of nested cycle this |
6194 assumption is not true: we use reduc_index to record the index of the | 6137 assumption is not true: we use reduc_index to record the index of the |
6195 reduction variable. */ | 6138 reduction variable. */ |
6196 stmt_vec_info reduc_def_info = NULL; | 6139 reduc_def = PHI_RESULT (reduc_def_phi); |
6197 int reduc_index = -1; | |
6198 for (i = 0; i < op_type; i++) | 6140 for (i = 0; i < op_type; i++) |
6199 { | 6141 { |
6142 tree op = gimple_op (stmt, i + 1); | |
6200 /* The condition of COND_EXPR is checked in vectorizable_condition(). */ | 6143 /* The condition of COND_EXPR is checked in vectorizable_condition(). */ |
6201 if (i == 0 && code == COND_EXPR) | 6144 if (i == 0 && code == COND_EXPR) |
6202 continue; | 6145 continue; |
6203 | 6146 |
6204 stmt_vec_info def_stmt_info; | 6147 stmt_vec_info def_stmt_info; |
6205 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem, | 6148 enum vect_def_type dt; |
6206 &def_stmt_info); | 6149 if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem, |
6207 dt = dts[i]; | 6150 &def_stmt_info)) |
6208 gcc_assert (is_simple_use); | 6151 { |
6209 if (dt == vect_reduction_def) | 6152 if (dump_enabled_p ()) |
6210 { | 6153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6211 reduc_def_info = def_stmt_info; | 6154 "use not simple.\n"); |
6212 reduc_index = i; | 6155 return false; |
6213 continue; | 6156 } |
6214 } | 6157 if (i == STMT_VINFO_REDUC_IDX (stmt_info)) |
6215 else if (tem) | 6158 continue; |
6216 { | 6159 |
6217 /* To properly compute ncopies we are interested in the widest | 6160 /* There should be only one cycle def in the stmt, the one |
6218 input type in case we're looking at a widening accumulation. */ | 6161 leading to reduc_def. */ |
6219 if (!vectype_in | 6162 if (VECTORIZABLE_CYCLE_DEF (dt)) |
6163 return false; | |
6164 | |
6165 /* To properly compute ncopies we are interested in the widest | |
6166 non-reduction input type in case we're looking at a widening | |
6167 accumulation that we later handle in vect_transform_reduction. */ | |
6168 if (lane_reduc_code_p | |
6169 && tem | |
6170 && (!vectype_in | |
6220 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) | 6171 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) |
6221 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))) | 6172 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))) |
6222 vectype_in = tem; | 6173 vectype_in = tem; |
6223 } | 6174 |
6224 | 6175 if (code == COND_EXPR) |
6225 if (dt != vect_internal_def | 6176 { |
6226 && dt != vect_external_def | 6177 /* Record how the non-reduction-def value of COND_EXPR is defined. */ |
6227 && dt != vect_constant_def | |
6228 && dt != vect_induction_def | |
6229 && !(dt == vect_nested_cycle && nested_cycle)) | |
6230 return false; | |
6231 | |
6232 if (dt == vect_nested_cycle) | |
6233 { | |
6234 found_nested_cycle_def = true; | |
6235 reduc_def_info = def_stmt_info; | |
6236 reduc_index = i; | |
6237 } | |
6238 | |
6239 if (i == 1 && code == COND_EXPR) | |
6240 { | |
6241 /* Record how value of COND_EXPR is defined. */ | |
6242 if (dt == vect_constant_def) | 6178 if (dt == vect_constant_def) |
6243 { | 6179 { |
6244 cond_reduc_dt = dt; | 6180 cond_reduc_dt = dt; |
6245 cond_reduc_val = ops[i]; | 6181 cond_reduc_val = op; |
6246 } | 6182 } |
6247 if (dt == vect_induction_def | 6183 if (dt == vect_induction_def |
6248 && def_stmt_info | 6184 && def_stmt_info |
6249 && is_nonwrapping_integer_induction (def_stmt_info, loop)) | 6185 && is_nonwrapping_integer_induction (def_stmt_info, loop)) |
6250 { | 6186 { |
6251 cond_reduc_dt = dt; | 6187 cond_reduc_dt = dt; |
6252 cond_stmt_vinfo = def_stmt_info; | 6188 cond_stmt_vinfo = def_stmt_info; |
6253 } | 6189 } |
6254 } | 6190 } |
6255 } | 6191 } |
6256 | |
6257 if (!vectype_in) | 6192 if (!vectype_in) |
6258 vectype_in = vectype_out; | 6193 vectype_in = STMT_VINFO_VECTYPE (phi_info); |
6259 | 6194 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; |
6260 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not | 6195 |
6261 directly used in stmt. */ | 6196 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); |
6262 if (reduc_index == -1) | 6197 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; |
6263 { | 6198 /* If we have a condition reduction, see if we can simplify it further. */ |
6264 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) | 6199 if (v_reduc_type == COND_REDUCTION) |
6200 { | |
6201 if (slp_node) | |
6202 return false; | |
6203 | |
6204 /* When the condition uses the reduction value in the condition, fail. */ | |
6205 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0) | |
6265 { | 6206 { |
6266 if (dump_enabled_p ()) | 6207 if (dump_enabled_p ()) |
6267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | 6208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6268 "in-order reduction chain without SLP.\n"); | 6209 "condition depends on previous iteration\n"); |
6269 return false; | 6210 return false; |
6270 } | 6211 } |
6271 | 6212 |
6272 if (orig_stmt_info) | 6213 if (reduc_chain_length == 1 |
6273 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info); | 6214 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, |
6274 else | 6215 vectype_in, OPTIMIZE_FOR_SPEED)) |
6275 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info); | |
6276 } | |
6277 | |
6278 if (! reduc_def_info) | |
6279 return false; | |
6280 | |
6281 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt); | |
6282 if (!reduc_def_phi) | |
6283 return false; | |
6284 | |
6285 if (!(reduc_index == -1 | |
6286 || dts[reduc_index] == vect_reduction_def | |
6287 || dts[reduc_index] == vect_nested_cycle | |
6288 || ((dts[reduc_index] == vect_internal_def | |
6289 || dts[reduc_index] == vect_external_def | |
6290 || dts[reduc_index] == vect_constant_def | |
6291 || dts[reduc_index] == vect_induction_def) | |
6292 && nested_cycle && found_nested_cycle_def))) | |
6293 { | |
6294 /* For pattern recognized stmts, orig_stmt might be a reduction, | |
6295 but some helper statements for the pattern might not, or | |
6296 might be COND_EXPRs with reduction uses in the condition. */ | |
6297 gcc_assert (orig_stmt_info); | |
6298 return false; | |
6299 } | |
6300 | |
6301 /* PHIs should not participate in patterns. */ | |
6302 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info)); | |
6303 enum vect_reduction_type v_reduc_type | |
6304 = STMT_VINFO_REDUC_TYPE (reduc_def_info); | |
6305 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info); | |
6306 | |
6307 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type; | |
6308 /* If we have a condition reduction, see if we can simplify it further. */ | |
6309 if (v_reduc_type == COND_REDUCTION) | |
6310 { | |
6311 /* TODO: We can't yet handle reduction chains, since we need to treat | |
6312 each COND_EXPR in the chain specially, not just the last one. | |
6313 E.g. for: | |
6314 | |
6315 x_1 = PHI <x_3, ...> | |
6316 x_2 = a_2 ? ... : x_1; | |
6317 x_3 = a_3 ? ... : x_2; | |
6318 | |
6319 we're interested in the last element in x_3 for which a_2 || a_3 | |
6320 is true, whereas the current reduction chain handling would | |
6321 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3 | |
6322 as a reduction operation. */ | |
6323 if (reduc_index == -1) | |
6324 { | |
6325 if (dump_enabled_p ()) | |
6326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6327 "conditional reduction chains not supported\n"); | |
6328 return false; | |
6329 } | |
6330 | |
6331 /* vect_is_simple_reduction ensured that operand 2 is the | |
6332 loop-carried operand. */ | |
6333 gcc_assert (reduc_index == 2); | |
6334 | |
6335 /* Loop peeling modifies initial value of reduction PHI, which | |
6336 makes the reduction stmt to be transformed different to the | |
6337 original stmt analyzed. We need to record reduction code for | |
6338 CONST_COND_REDUCTION type reduction at analyzing stage, thus | |
6339 it can be used directly at transform stage. */ | |
6340 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR | |
6341 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR) | |
6342 { | |
6343 /* Also set the reduction type to CONST_COND_REDUCTION. */ | |
6344 gcc_assert (cond_reduc_dt == vect_constant_def); | |
6345 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION; | |
6346 } | |
6347 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, | |
6348 vectype_in, OPTIMIZE_FOR_SPEED)) | |
6349 { | 6216 { |
6350 if (dump_enabled_p ()) | 6217 if (dump_enabled_p ()) |
6351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | 6218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6352 "optimizing condition reduction with" | 6219 "optimizing condition reduction with" |
6353 " FOLD_EXTRACT_LAST.\n"); | 6220 " FOLD_EXTRACT_LAST.\n"); |
6354 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION; | 6221 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION; |
6355 } | 6222 } |
6356 else if (cond_reduc_dt == vect_induction_def) | 6223 else if (cond_reduc_dt == vect_induction_def) |
6357 { | 6224 { |
6358 tree base | 6225 tree base |
6359 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo); | 6226 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo); |
6360 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo); | 6227 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo); |
6361 | 6228 |
6362 gcc_assert (TREE_CODE (base) == INTEGER_CST | 6229 gcc_assert (TREE_CODE (base) == INTEGER_CST |
6363 && TREE_CODE (step) == INTEGER_CST); | 6230 && TREE_CODE (step) == INTEGER_CST); |
6364 cond_reduc_val = NULL_TREE; | 6231 cond_reduc_val = NULL_TREE; |
6232 enum tree_code cond_reduc_op_code = ERROR_MARK; | |
6233 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo)); | |
6234 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base))) | |
6235 ; | |
6365 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR | 6236 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR |
6366 above base; punt if base is the minimum value of the type for | 6237 above base; punt if base is the minimum value of the type for |
6367 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */ | 6238 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */ |
6368 if (tree_int_cst_sgn (step) == -1) | 6239 else if (tree_int_cst_sgn (step) == -1) |
6369 { | 6240 { |
6370 cond_reduc_op_code = MIN_EXPR; | 6241 cond_reduc_op_code = MIN_EXPR; |
6371 if (tree_int_cst_sgn (base) == -1) | 6242 if (tree_int_cst_sgn (base) == -1) |
6372 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); | 6243 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); |
6373 else if (tree_int_cst_lt (base, | 6244 else if (tree_int_cst_lt (base, |
6389 { | 6260 { |
6390 if (dump_enabled_p ()) | 6261 if (dump_enabled_p ()) |
6391 dump_printf_loc (MSG_NOTE, vect_location, | 6262 dump_printf_loc (MSG_NOTE, vect_location, |
6392 "condition expression based on " | 6263 "condition expression based on " |
6393 "integer induction.\n"); | 6264 "integer induction.\n"); |
6394 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) | 6265 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code; |
6395 = INTEGER_INDUC_COND_REDUCTION; | 6266 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) |
6267 = cond_reduc_val; | |
6268 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION; | |
6396 } | 6269 } |
6397 } | 6270 } |
6398 else if (cond_reduc_dt == vect_constant_def) | 6271 else if (cond_reduc_dt == vect_constant_def) |
6399 { | 6272 { |
6400 enum vect_def_type cond_initial_dt; | 6273 enum vect_def_type cond_initial_dt; |
6401 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]); | |
6402 tree cond_initial_val | 6274 tree cond_initial_val |
6403 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); | 6275 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop)); |
6404 | 6276 |
6405 gcc_assert (cond_reduc_val != NULL_TREE); | 6277 gcc_assert (cond_reduc_val != NULL_TREE); |
6406 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); | 6278 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); |
6407 if (cond_initial_dt == vect_constant_def | 6279 if (cond_initial_dt == vect_constant_def |
6408 && types_compatible_p (TREE_TYPE (cond_initial_val), | 6280 && types_compatible_p (TREE_TYPE (cond_initial_val), |
6415 if (dump_enabled_p ()) | 6287 if (dump_enabled_p ()) |
6416 dump_printf_loc (MSG_NOTE, vect_location, | 6288 dump_printf_loc (MSG_NOTE, vect_location, |
6417 "condition expression based on " | 6289 "condition expression based on " |
6418 "compile time constant.\n"); | 6290 "compile time constant.\n"); |
6419 /* Record reduction code at analysis stage. */ | 6291 /* Record reduction code at analysis stage. */ |
6420 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) | 6292 STMT_VINFO_REDUC_CODE (reduc_info) |
6421 = integer_onep (e) ? MAX_EXPR : MIN_EXPR; | 6293 = integer_onep (e) ? MAX_EXPR : MIN_EXPR; |
6422 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) | 6294 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION; |
6423 = CONST_COND_REDUCTION; | |
6424 } | 6295 } |
6425 } | 6296 } |
6426 } | 6297 } |
6427 } | 6298 } |
6428 | 6299 |
6429 if (orig_stmt_info) | 6300 if (STMT_VINFO_LIVE_P (phi_info)) |
6430 gcc_assert (tmp == orig_stmt_info | |
6431 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info); | |
6432 else | |
6433 /* We changed STMT to be the first stmt in reduction chain, hence we | |
6434 check that in this case the first element in the chain is STMT. */ | |
6435 gcc_assert (tmp == stmt_info | |
6436 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info); | |
6437 | |
6438 if (STMT_VINFO_LIVE_P (reduc_def_info)) | |
6439 return false; | 6301 return false; |
6440 | 6302 |
6441 if (slp_node) | 6303 if (slp_node) |
6442 ncopies = 1; | 6304 ncopies = 1; |
6443 else | 6305 else |
6444 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); | 6306 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); |
6445 | 6307 |
6446 gcc_assert (ncopies >= 1); | 6308 gcc_assert (ncopies >= 1); |
6447 | 6309 |
6448 vec_mode = TYPE_MODE (vectype_in); | |
6449 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); | 6310 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); |
6450 | 6311 |
6451 if (code == COND_EXPR) | 6312 if (nested_cycle) |
6452 { | 6313 { |
6453 /* Only call during the analysis stage, otherwise we'll lose | 6314 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) |
6454 STMT_VINFO_TYPE. */ | 6315 == vect_double_reduction_def); |
6455 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL, | 6316 double_reduc = true; |
6456 ops[reduc_index], 0, NULL, | |
6457 cost_vec)) | |
6458 { | |
6459 if (dump_enabled_p ()) | |
6460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6461 "unsupported condition in reduction\n"); | |
6462 return false; | |
6463 } | |
6464 } | |
6465 else | |
6466 { | |
6467 /* 4. Supportable by target? */ | |
6468 | |
6469 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR | |
6470 || code == LROTATE_EXPR || code == RROTATE_EXPR) | |
6471 { | |
6472 /* Shifts and rotates are only supported by vectorizable_shifts, | |
6473 not vectorizable_reduction. */ | |
6474 if (dump_enabled_p ()) | |
6475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6476 "unsupported shift or rotation.\n"); | |
6477 return false; | |
6478 } | |
6479 | |
6480 /* 4.1. check support for the operation in the loop */ | |
6481 optab = optab_for_tree_code (code, vectype_in, optab_default); | |
6482 if (!optab) | |
6483 { | |
6484 if (dump_enabled_p ()) | |
6485 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6486 "no optab.\n"); | |
6487 | |
6488 return false; | |
6489 } | |
6490 | |
6491 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing) | |
6492 { | |
6493 if (dump_enabled_p ()) | |
6494 dump_printf (MSG_NOTE, "op not supported by target.\n"); | |
6495 | |
6496 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) | |
6497 || !vect_worthwhile_without_simd_p (loop_vinfo, code)) | |
6498 return false; | |
6499 | |
6500 if (dump_enabled_p ()) | |
6501 dump_printf (MSG_NOTE, "proceeding using word mode.\n"); | |
6502 } | |
6503 | |
6504 /* Worthwhile without SIMD support? */ | |
6505 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in)) | |
6506 && !vect_worthwhile_without_simd_p (loop_vinfo, code)) | |
6507 { | |
6508 if (dump_enabled_p ()) | |
6509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6510 "not worthwhile without SIMD support.\n"); | |
6511 | |
6512 return false; | |
6513 } | |
6514 } | 6317 } |
6515 | 6318 |
6516 /* 4.2. Check support for the epilog operation. | 6319 /* 4.2. Check support for the epilog operation. |
6517 | 6320 |
6518 If STMT represents a reduction pattern, then the type of the | 6321 If STMT represents a reduction pattern, then the type of the |
6546 the arguments are the same as the type of the reduction variable. | 6349 the arguments are the same as the type of the reduction variable. |
6547 For "regular" reductions we can therefore use the same vector type | 6350 For "regular" reductions we can therefore use the same vector type |
6548 (and also the same tree-code) when generating the epilog code and | 6351 (and also the same tree-code) when generating the epilog code and |
6549 when generating the code inside the loop. */ | 6352 when generating the code inside the loop. */ |
6550 | 6353 |
6551 vect_reduction_type reduction_type | 6354 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info); |
6552 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); | 6355 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; |
6553 if (orig_stmt_info | 6356 |
6554 && (reduction_type == TREE_CODE_REDUCTION | 6357 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); |
6555 || reduction_type == FOLD_LEFT_REDUCTION)) | 6358 if (reduction_type == TREE_CODE_REDUCTION) |
6556 { | 6359 { |
6557 /* This is a reduction pattern: get the vectype from the type of the | 6360 /* Check whether it's ok to change the order of the computation. |
6558 reduction variable, and get the tree-code from orig_stmt. */ | 6361 Generally, when vectorizing a reduction we change the order of the |
6559 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt); | 6362 computation. This may change the behavior of the program in some |
6560 gcc_assert (vectype_out); | 6363 cases, so we need to check that this is ok. One exception is when |
6561 vec_mode = TYPE_MODE (vectype_out); | 6364 vectorizing an outer-loop: the inner-loop is executed sequentially, |
6562 } | 6365 and therefore vectorizing reductions in the inner-loop during |
6563 else | 6366 outer-loop vectorization is safe. */ |
6564 { | 6367 if (needs_fold_left_reduction_p (scalar_type, orig_code)) |
6565 /* Regular reduction: the same vectype and tree-code as used for | 6368 {
6566 the vector code inside the loop can be used for the epilog code. */ | 6369 /* When vectorizing a reduction chain w/o SLP the reduction PHI |
6567 orig_code = code; | 6370 is not directly used in stmt. */
6568 | 6371 if (!only_slp_reduc_chain |
6569 if (code == MINUS_EXPR) | 6372 && reduc_chain_length != 1) |
6570 orig_code = PLUS_EXPR; | 6373 { |
6571 | 6374 if (dump_enabled_p ()) |
6572 /* For simple condition reductions, replace with the actual expression | 6375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6573 we want to base our reduction around. */ | 6376 "in-order reduction chain without SLP.\n"); |
6574 if (reduction_type == CONST_COND_REDUCTION) | 6377 return false; |
6575 { | 6378 } |
6576 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); | 6379 STMT_VINFO_REDUC_TYPE (reduc_info) |
6577 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR); | 6380 = reduction_type = FOLD_LEFT_REDUCTION; |
6578 } | 6381 } |
6579 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION) | 6382 else if (!commutative_tree_code (orig_code) |
6580 orig_code = cond_reduc_op_code; | 6383 || !associative_tree_code (orig_code)) |
6581 } | 6384 { |
6582 | 6385 if (dump_enabled_p ()) |
6583 if (nested_cycle) | 6386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6584 { | 6387 "reduction: not commutative/associative"); |
6585 def_bb = gimple_bb (reduc_def_phi); | 6388 return false; |
6586 def_stmt_loop = def_bb->loop_father; | 6389 } |
6587 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, | 6390 } |
6588 loop_preheader_edge (def_stmt_loop)); | 6391 |
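A hedged example of what needs_fold_left_reduction_p (checked just above) flags: without reassociation, e.g. no -ffast-math, floating-point addition must keep the scalar evaluation order, so the reduction is forced to FOLD_LEFT_REDUCTION instead of being split into independent partial sums.

  double inorder_sum (const double *a, int n)
  {
    double s = 0.0;
    for (int i = 0; i < n; i++)
      s += a[i];   /* (s + a[0]) + a[1] + ...: the order must be kept */
    return s;
  }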
6589 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg); | 6392 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) |
6590 if (def_arg_stmt_info | 6393 && ncopies > 1) |
6591 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info) | 6394 { |
6592 == vect_double_reduction_def)) | 6395 if (dump_enabled_p ()) |
6593 double_reduc = true; | 6396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6594 } | 6397 "multiple types in double reduction or condition " |
6595 | 6398 "reduction or fold-left reduction.\n"); |
6596 reduc_fn = IFN_LAST; | 6399 return false; |
6597 | 6400 } |
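For reference, the classic shape of a double reduction, where the outer-loop PHI is classified as vect_double_reduction_def (a sketch; names illustrative): one accumulator is carried around both loops, and the inner loop's reduction cycle starts from the outer-loop PHI.

  int double_reduc (int a[][16], int n)
  {
    int s = 0;
    for (int i = 0; i < n; i++)      /* outer loop */
      for (int j = 0; j < 16; j++)   /* inner loop */
        s += a[i][j];                /* one accumulator spans both loops */
    return s;
  }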
6401 | |
6402 internal_fn reduc_fn = IFN_LAST; | |
6598 if (reduction_type == TREE_CODE_REDUCTION | 6403 if (reduction_type == TREE_CODE_REDUCTION |
6599 || reduction_type == FOLD_LEFT_REDUCTION | 6404 || reduction_type == FOLD_LEFT_REDUCTION |
6600 || reduction_type == INTEGER_INDUC_COND_REDUCTION | 6405 || reduction_type == INTEGER_INDUC_COND_REDUCTION |
6601 || reduction_type == CONST_COND_REDUCTION) | 6406 || reduction_type == CONST_COND_REDUCTION) |
6602 { | 6407 { |
6637 | 6442 |
6638 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type, | 6443 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type, |
6639 OPTIMIZE_FOR_SPEED)) | 6444 OPTIMIZE_FOR_SPEED)) |
6640 reduc_fn = IFN_REDUC_MAX; | 6445 reduc_fn = IFN_REDUC_MAX; |
6641 } | 6446 } |
6447 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn; | |
6642 | 6448 |
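To make the condition-reduction handling above concrete, a hedged source-level example: the result is the value assigned on the last iteration where the condition held, which the vectorizer recovers by keeping a vector of iteration indices and reducing it with IFN_REDUC_MAX in the epilogue.

  int cond_reduc (const int *a, const int *b, int n)
  {
    int last = -1;
    for (int i = 0; i < n; i++)
      if (a[i] < b[i])
        last = i;   /* last index satisfying the condition */
    return last;
  }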
6643 if (reduction_type != EXTRACT_LAST_REDUCTION | 6449 if (reduction_type != EXTRACT_LAST_REDUCTION |
6644 && (!nested_cycle || double_reduc) | 6450 && (!nested_cycle || double_reduc) |
6645 && reduc_fn == IFN_LAST | 6451 && reduc_fn == IFN_LAST |
6646 && !nunits_out.is_constant ()) | 6452 && !nunits_out.is_constant ()) |
6650 "missing target support for reduction on" | 6456 "missing target support for reduction on" |
6651 " variable-length vectors.\n"); | 6457 " variable-length vectors.\n"); |
6652 return false; | 6458 return false; |
6653 } | 6459 } |
6654 | 6460 |
6655 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) | |
6656 && ncopies > 1) | |
6657 { | |
6658 if (dump_enabled_p ()) | |
6659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6660 "multiple types in double reduction or condition " | |
6661 "reduction.\n"); | |
6662 return false; | |
6663 } | |
6664 | |
6665 /* For SLP reductions, see if there is a neutral value we can use. */ | 6461 /* For SLP reductions, see if there is a neutral value we can use. */ |
6666 tree neutral_op = NULL_TREE; | 6462 tree neutral_op = NULL_TREE; |
6667 if (slp_node) | 6463 if (slp_node) |
6668 neutral_op = neutral_op_for_slp_reduction | 6464 neutral_op = neutral_op_for_slp_reduction |
6669 (slp_node_instance->reduc_phis, code, | 6465 (slp_node_instance->reduc_phis, vectype_out, orig_code, |
6670 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL); | 6466 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL); |
6671 | 6467 |
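The neutral value requested here is the one that leaves the reduction unchanged, so extra vector lanes can be padded harmlessly: 0 for PLUS_EXPR, 1 for MULT_EXPR, all-ones for BIT_AND_EXPR, and so on. A sketch of an initial vector for a three-statement SLP product reduction (GNU vector extension; names ours):

  typedef int v4si __attribute__ ((vector_size (16)));

  /* The unused fourth lane holds the MULT_EXPR neutral element 1.  */
  v4si slp_product_init (int x0, int y0, int z0)
  {
    return (v4si) { x0, y0, z0, 1 };
  }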
6672 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) | 6468 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) |
6673 { | 6469 { |
6674 /* We can't support in-order reductions of code such as this: | 6470 /* We can't support in-order reductions of code such as this: |
6729 /* We checked above that we could build the initial vector when | 6525 /* We checked above that we could build the initial vector when |
6730 there's a neutral element value. Check here for the case in | 6526 there's a neutral element value. Check here for the case in |
6731 which each SLP statement has its own initial value and in which | 6527 which each SLP statement has its own initial value and in which |
6732 that value needs to be repeated for every instance of the | 6528 that value needs to be repeated for every instance of the |
6733 statement within the initial vector. */ | 6529 statement within the initial vector. */ |
6734 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); | 6530 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); |
6735 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out)); | |
6736 if (!neutral_op | 6531 if (!neutral_op |
6737 && !can_duplicate_and_interleave_p (group_size, elt_mode)) | 6532 && !can_duplicate_and_interleave_p (loop_vinfo, group_size, |
6533 TREE_TYPE (vectype_out))) | |
6738 { | 6534 { |
6739 if (dump_enabled_p ()) | 6535 if (dump_enabled_p ()) |
6740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | 6536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6741 "unsupported form of SLP reduction for" | 6537 "unsupported form of SLP reduction for" |
6742 " variable-length vectors: cannot build" | 6538 " variable-length vectors: cannot build" |
6753 "unsupported form of SLP reduction for" | 6549 "unsupported form of SLP reduction for" |
6754 " variable-length vectors: the vector size" | 6550 " variable-length vectors: the vector size" |
6755 " is not a multiple of the number of results.\n"); | 6551 " is not a multiple of the number of results.\n"); |
6756 return false; | 6552 return false; |
6757 } | 6553 } |
6758 } | |
6759 | |
6760 /* In case of widening multiplication by a constant, we update the type | |
6761 of the constant to be the type of the other operand. We check that the | |
6762 constant fits the type in the pattern recognition pass. */ | |
6763 if (code == DOT_PROD_EXPR | |
6764 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1]))) | |
6765 { | |
6766 if (TREE_CODE (ops[0]) == INTEGER_CST) | |
6767 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]); | |
6768 else if (TREE_CODE (ops[1]) == INTEGER_CST) | |
6769 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]); | |
6770 else | |
6771 { | |
6772 if (dump_enabled_p ()) | |
6773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6774 "invalid types in dot-prod\n"); | |
6775 | |
6776 return false; | |
6777 } | |
6778 } | 6554 } |
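A hedged sketch of the source pattern behind the dot-product hunk on the left: one multiplicand of the widening multiply is an integer constant, which is simply folded to the narrow type of the other operand.

  int dot_prod_const (const short *b, int n)
  {
    int s = 0;
    for (int i = 0; i < n; i++)
      s += b[i] * 3;   /* 3 is folded to (short) 3 to match b[i] */
    return s;
  }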
6779 | 6555 |
6780 if (reduction_type == COND_REDUCTION) | 6556 if (reduction_type == COND_REDUCTION) |
6781 { | 6557 { |
6782 widest_int ni; | 6558 widest_int ni; |
6832 from the vectorized reduction operation generated in the previous iteration. | 6608 from the vectorized reduction operation generated in the previous iteration. |
6833 | 6609 |
6834 This only works when we see both the reduction PHI and its only consumer | 6610 This only works when we see both the reduction PHI and its only consumer |
6835 in vectorizable_reduction and there are no intermediate stmts | 6611 in vectorizable_reduction and there are no intermediate stmts |
6836 participating. */ | 6612 participating. */ |
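A minimal sketch of that single def-use cycle for ncopies == 2, written with GCC generic vectors and a hypothetical vload helper: the two in-loop vector adds chain through one accumulator instead of maintaining two accumulators that would then have to be combined in the epilogue.

  typedef int v4si __attribute__ ((vector_size (16)));
  extern v4si vload (const int *);   /* hypothetical vector load */

  v4si single_cycle_sum (const int *a, int n, v4si vinit)
  {
    v4si vacc = vinit;               /* the single reduction PHI */
    for (int i = 0; i < n; i += 8)
      {
        v4si t = vacc + vload (a + i);    /* copy 1 */
        vacc = t + vload (a + i + 4);     /* copy 2 consumes copy 1 */
      }
    return vacc;
  }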
6837 stmt_vec_info use_stmt_info; | |
6838 tree reduc_phi_result = gimple_phi_result (reduc_def_phi); | |
6839 if (ncopies > 1 | 6613 if (ncopies > 1 |
6840 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) | 6614 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) |
6841 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result)) | 6615 && reduc_chain_length == 1) |
6842 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info) | 6616 single_defuse_cycle = true; |
6843 { | 6617 |
6844 single_defuse_cycle = true; | 6618 if (single_defuse_cycle || lane_reduc_code_p) |
6845 epilog_copies = 1; | 6619 { |
6846 } | 6620 gcc_assert (code != COND_EXPR); |
6847 else | 6621 |
6848 epilog_copies = ncopies; | 6622 /* 4. Supportable by target? */ |
6623 bool ok = true; | |
6624 | |
6625 /* 4.1. Check support for the operation in the loop */ | |
6626 optab optab = optab_for_tree_code (code, vectype_in, optab_vector); | |
6627 if (!optab) | |
6628 { | |
6629 if (dump_enabled_p ()) | |
6630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6631 "no optab.\n"); | |
6632 ok = false; | |
6633 } | |
6634 | |
6635 machine_mode vec_mode = TYPE_MODE (vectype_in); | |
6636 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing) | |
6637 { | |
6638 if (dump_enabled_p ()) | |
6639 dump_printf (MSG_NOTE, "op not supported by target.\n"); | |
6640 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) | |
6641 || !vect_worthwhile_without_simd_p (loop_vinfo, code)) | |
6642 ok = false; | |
6643 else | |
6644 if (dump_enabled_p ()) | |
6645 dump_printf (MSG_NOTE, "proceeding using word mode.\n"); | |
6646 } | |
6647 | |
6648 /* Worthwhile without SIMD support? */ | |
6649 if (ok | |
6650 && !VECTOR_MODE_P (TYPE_MODE (vectype_in)) | |
6651 && !vect_worthwhile_without_simd_p (loop_vinfo, code)) | |
6652 { | |
6653 if (dump_enabled_p ()) | |
6654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6655 "not worthwhile without SIMD support.\n"); | |
6656 ok = false; | |
6657 } | |
6658 | |
6659 /* lane-reducing operations have to go through vect_transform_reduction. | |
6660 For the other cases try without the single cycle optimization. */ | |
6661 if (!ok) | |
6662 { | |
6663 if (lane_reduc_code_p) | |
6664 return false; | |
6665 else | |
6666 single_defuse_cycle = false; | |
6667 } | |
6668 } | |
6669 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; | |
6849 | 6670 |
6850 /* If the reduction stmt is one of the patterns that have lane | 6671 /* If the reduction stmt is one of the patterns that have lane |
6851 reduction embedded we cannot handle the case of ! single_defuse_cycle. */ | 6672 reduction embedded we cannot handle the case of ! single_defuse_cycle. */ |
6852 if ((ncopies > 1 | 6673 if ((ncopies > 1 && ! single_defuse_cycle) |
6853 && ! single_defuse_cycle) | 6674 && lane_reduc_code_p) |
6854 && (code == DOT_PROD_EXPR | |
6855 || code == WIDEN_SUM_EXPR | |
6856 || code == SAD_EXPR)) | |
6857 { | 6675 { |
6858 if (dump_enabled_p ()) | 6676 if (dump_enabled_p ()) |
6859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | 6677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6860 "multi def-use cycle not possible for lane-reducing " | 6678 "multi def-use cycle not possible for lane-reducing " |
6861 "reduction operation\n"); | 6679 "reduction operation\n"); |
6865 if (slp_node) | 6683 if (slp_node) |
6866 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | 6684 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); |
6867 else | 6685 else |
6868 vec_num = 1; | 6686 vec_num = 1; |
6869 | 6687 |
6688 vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies, | |
6689 cost_vec); | |
6690 if (dump_enabled_p () | |
6691 && reduction_type == FOLD_LEFT_REDUCTION) | |
6692 dump_printf_loc (MSG_NOTE, vect_location, | |
6693 "using an in-order (fold-left) reduction.\n"); | |
6694 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; | |
6695 /* All but single defuse-cycle optimized, lane-reducing and fold-left | |
6696 reductions go through their own vectorizable_* routines. */ | |
6697 if (!single_defuse_cycle | |
6698 && code != DOT_PROD_EXPR | |
6699 && code != WIDEN_SUM_EXPR | |
6700 && code != SAD_EXPR | |
6701 && reduction_type != FOLD_LEFT_REDUCTION) | |
6702 { | |
6703 stmt_vec_info tem | |
6704 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); | |
6705 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem)) | |
6706 { | |
6707 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem)); | |
6708 tem = REDUC_GROUP_FIRST_ELEMENT (tem); | |
6709 } | |
6710 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def; | |
6711 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def; | |
6712 } | |
6713 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) | |
6714 { | |
6715 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); | |
6716 internal_fn cond_fn = get_conditional_internal_fn (code); | |
6717 | |
6718 if (reduction_type != FOLD_LEFT_REDUCTION | |
6719 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in) | |
6720 && (cond_fn == IFN_LAST | |
6721 || !direct_internal_fn_supported_p (cond_fn, vectype_in, | |
6722 OPTIMIZE_FOR_SPEED))) | |
6723 { | |
6724 if (dump_enabled_p ()) | |
6725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6726 "can't use a fully-masked loop because no" | |
6727 " conditional operation is available.\n"); | |
6728 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; | |
6729 } | |
6730 else if (reduction_type == FOLD_LEFT_REDUCTION | |
6731 && reduc_fn == IFN_LAST | |
6732 && !expand_vec_cond_expr_p (vectype_in, | |
6733 truth_type_for (vectype_in), | |
6734 SSA_NAME)) | |
6735 { | |
6736 if (dump_enabled_p ()) | |
6737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6738 "can't use a fully-masked loop because no" | |
6739 " conditional operation is available.\n"); | |
6740 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; | |
6741 } | |
6742 else | |
6743 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, | |
6744 vectype_in, NULL); | |
6745 } | |
6746 return true; | |
6747 } | |
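The fully-masked gating above boils down to this per-lane contract, sketched here in scalar form (an illustration, not the emitted code): a masked-off lane must behave as if its iteration never ran, which is what a conditional internal function or the VEC_COND_EXPR fallback provides.

  /* Per-lane semantics of one masked reduction step.  */
  int masked_add_lane (int acc, int elt, _Bool lane_active)
  {
    return lane_active ? acc + elt : acc;
  }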
6748 | |
6749 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge | |
6750 value. */ | |
6751 | |
6752 bool | |
6753 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, | |
6754 stmt_vec_info *vec_stmt, slp_tree slp_node) | |
6755 { | |
6756 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); | |
6757 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
6758 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
6759 int i; | |
6760 int ncopies; | |
6761 int j; | |
6762 int vec_num; | |
6763 | |
6764 stmt_vec_info reduc_info = info_for_reduction (stmt_info); | |
6765 gcc_assert (reduc_info->is_reduc_info); | |
6766 | |
6767 if (nested_in_vect_loop_p (loop, stmt_info)) | |
6768 { | |
6769 loop = loop->inner; | |
6770 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def); | |
6771 } | |
6772 | |
6773 gassign *stmt = as_a <gassign *> (stmt_info->stmt); | |
6774 enum tree_code code = gimple_assign_rhs_code (stmt); | |
6775 int op_type = TREE_CODE_LENGTH (code); | |
6776 | |
6777 /* Flatten RHS. */ | |
6778 tree ops[3]; | |
6779 switch (get_gimple_rhs_class (code)) | |
6780 { | |
6781 case GIMPLE_TERNARY_RHS: | |
6782 ops[2] = gimple_assign_rhs3 (stmt); | |
6783 /* Fall thru. */ | |
6784 case GIMPLE_BINARY_RHS: | |
6785 ops[0] = gimple_assign_rhs1 (stmt); | |
6786 ops[1] = gimple_assign_rhs2 (stmt); | |
6787 break; | |
6788 default: | |
6789 gcc_unreachable (); | |
6790 } | |
6791 | |
6792 /* All uses but the last are expected to be defined in the loop. | |
6793 The last use is the reduction variable. In case of nested cycle this | |
6794 assumption is not true: we use reduc_index to record the index of the | |
6795 reduction variable. */ | |
6796 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)); | |
6797 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt); | |
6798 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); | |
6799 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); | |
6800 | |
6801 if (slp_node) | |
6802 { | |
6803 ncopies = 1; | |
6804 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | |
6805 } | |
6806 else | |
6807 { | |
6808 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); | |
6809 vec_num = 1; | |
6810 } | |
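In the non-SLP case, ncopies is simply the vectorization factor divided by the number of elements in vectype_in, so with VF 16 a four-element input vector type yields four vector statements per scalar statement, while a sixteen-element one needs only one. A worked sketch:

  /* vect_get_num_copies in spirit: VF / nunits (vectype_in).
     vf = 16, nunits_in = 4   ->  4 copies
     vf = 16, nunits_in = 16  ->  1 copy  */
  unsigned num_copies (unsigned vf, unsigned nunits_in)
  {
    return vf / nunits_in;
  }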
6811 | |
6870 internal_fn cond_fn = get_conditional_internal_fn (code); | 6812 internal_fn cond_fn = get_conditional_internal_fn (code); |
6871 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); | 6813 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); |
6872 | 6814 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in); |
6873 if (!vec_stmt) /* transformation not required. */ | |
6874 { | |
6875 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec); | |
6876 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) | |
6877 { | |
6878 if (reduction_type != FOLD_LEFT_REDUCTION | |
6879 && (cond_fn == IFN_LAST | |
6880 || !direct_internal_fn_supported_p (cond_fn, vectype_in, | |
6881 OPTIMIZE_FOR_SPEED))) | |
6882 { | |
6883 if (dump_enabled_p ()) | |
6884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6885 "can't use a fully-masked loop because no" | |
6886 " conditional operation is available.\n"); | |
6887 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; | |
6888 } | |
6889 else if (reduc_index == -1) | |
6890 { | |
6891 if (dump_enabled_p ()) | |
6892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6893 "can't use a fully-masked loop for chained" | |
6894 " reductions.\n"); | |
6895 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; | |
6896 } | |
6897 else | |
6898 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, | |
6899 vectype_in); | |
6900 } | |
6901 if (dump_enabled_p () | |
6902 && reduction_type == FOLD_LEFT_REDUCTION) | |
6903 dump_printf_loc (MSG_NOTE, vect_location, | |
6904 "using an in-order (fold-left) reduction.\n"); | |
6905 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; | |
6906 return true; | |
6907 } | |
6908 | 6815 |
6909 /* Transform. */ | 6816 /* Transform. */ |
6817 stmt_vec_info new_stmt_info = NULL; | |
6818 stmt_vec_info prev_stmt_info; | |
6819 tree new_temp = NULL_TREE; | |
6820 auto_vec<tree> vec_oprnds0; | |
6821 auto_vec<tree> vec_oprnds1; | |
6822 auto_vec<tree> vec_oprnds2; | |
6823 tree def0; | |
6910 | 6824 |
6911 if (dump_enabled_p ()) | 6825 if (dump_enabled_p ()) |
6912 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); | 6826 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); |
6913 | 6827 |
6914 /* FORNOW: Multiple types are not supported for condition. */ | 6828 /* FORNOW: Multiple types are not supported for condition. */ |
6915 if (code == COND_EXPR) | 6829 if (code == COND_EXPR) |
6916 gcc_assert (ncopies == 1); | 6830 gcc_assert (ncopies == 1); |
6917 | 6831 |
6918 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); | 6832 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); |
6919 | 6833 |
6834 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); | |
6920 if (reduction_type == FOLD_LEFT_REDUCTION) | 6835 if (reduction_type == FOLD_LEFT_REDUCTION) |
6921 return vectorize_fold_left_reduction | 6836 { |
6922 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code, | 6837 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); |
6923 reduc_fn, ops, vectype_in, reduc_index, masks); | 6838 return vectorize_fold_left_reduction |
6924 | 6839 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code, |
6925 if (reduction_type == EXTRACT_LAST_REDUCTION) | 6840 reduc_fn, ops, vectype_in, reduc_index, masks); |
6926 { | 6841 } |
6927 gcc_assert (!slp_node); | 6842 |
6928 return vectorizable_condition (stmt_info, gsi, vec_stmt, | 6843 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info); |
6929 NULL, reduc_index, NULL, NULL); | 6844 gcc_assert (single_defuse_cycle |
6930 } | 6845 || code == DOT_PROD_EXPR |
6846 || code == WIDEN_SUM_EXPR | |
6847 || code == SAD_EXPR); | |
6931 | 6848 |
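The codes asserted above are the lane-reducing operations: a single vector statement consumes a whole vector of narrow elements but produces fewer, wider accumulator lanes. Hedged scalar shapes: DOT_PROD_EXPR is s += (int) a[i] * (int) b[i], WIDEN_SUM_EXPR is s += (int) a[i], and SAD_EXPR is the sum of absolute differences:

  int sad (const unsigned char *a, const unsigned char *b, int n)
  {
    int s = 0;
    for (int i = 0; i < n; i++)
      s += __builtin_abs (a[i] - b[i]);   /* SAD_EXPR shape */
    return s;
  }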
6932 /* Create the destination vector */ | 6849 /* Create the destination vector */ |
6933 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); | 6850 tree scalar_dest = gimple_assign_lhs (stmt); |
6851 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out); | |
6934 | 6852 |
6935 prev_stmt_info = NULL; | 6853 prev_stmt_info = NULL; |
6936 prev_phi_info = NULL; | |
6937 if (!slp_node) | 6854 if (!slp_node) |
6938 { | 6855 { |
6939 vec_oprnds0.create (1); | 6856 vec_oprnds0.create (1); |
6940 vec_oprnds1.create (1); | 6857 vec_oprnds1.create (1); |
6941 if (op_type == ternary_op) | 6858 if (op_type == ternary_op) |
6942 vec_oprnds2.create (1); | 6859 vec_oprnds2.create (1); |
6943 } | 6860 } |
6944 | 6861 |
6945 phis.create (vec_num); | |
6946 vect_defs.create (vec_num); | |
6947 if (!slp_node) | |
6948 vect_defs.quick_push (NULL_TREE); | |
6949 | |
6950 if (slp_node) | |
6951 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis)); | |
6952 else | |
6953 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info)); | |
6954 | |
6955 for (j = 0; j < ncopies; j++) | 6862 for (j = 0; j < ncopies; j++) |
6956 { | 6863 { |
6957 if (code == COND_EXPR) | |
6958 { | |
6959 gcc_assert (!slp_node); | |
6960 vectorizable_condition (stmt_info, gsi, vec_stmt, | |
6961 PHI_RESULT (phis[0]->stmt), | |
6962 reduc_index, NULL, NULL); | |
6963 /* Multiple types are not supported for condition. */ | |
6964 break; | |
6965 } | |
6966 | |
6967 /* Handle uses. */ | 6864 /* Handle uses. */ |
6968 if (j == 0) | 6865 if (j == 0) |
6969 { | 6866 { |
6970 if (slp_node) | 6867 if (slp_node) |
6971 { | 6868 { |
6972 /* Get vec defs for all the operands except the reduction index, | 6869 /* Get vec defs for all the operands except the reduction index, |
6973 ensuring the ordering of the ops in the vector is kept. */ | 6870 ensuring the ordering of the ops in the vector is kept. */ |
6974 auto_vec<tree, 3> slp_ops; | |
6975 auto_vec<vec<tree>, 3> vec_defs; | 6871 auto_vec<vec<tree>, 3> vec_defs; |
6976 | 6872 vect_get_slp_defs (slp_node, &vec_defs); |
6977 slp_ops.quick_push (ops[0]); | |
6978 slp_ops.quick_push (ops[1]); | |
6979 if (op_type == ternary_op) | |
6980 slp_ops.quick_push (ops[2]); | |
6981 | |
6982 vect_get_slp_defs (slp_ops, slp_node, &vec_defs); | |
6983 | |
6984 vec_oprnds0.safe_splice (vec_defs[0]); | 6873 vec_oprnds0.safe_splice (vec_defs[0]); |
6985 vec_defs[0].release (); | 6874 vec_defs[0].release (); |
6986 vec_oprnds1.safe_splice (vec_defs[1]); | 6875 vec_oprnds1.safe_splice (vec_defs[1]); |
6987 vec_defs[1].release (); | 6876 vec_defs[1].release (); |
6988 if (op_type == ternary_op) | 6877 if (op_type == ternary_op) |
7033 } | 6922 } |
7034 | 6923 |
7035 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) | 6924 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) |
7036 { | 6925 { |
7037 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; | 6926 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; |
7038 if (masked_loop_p) | 6927 if (masked_loop_p && !mask_by_cond_expr) |
7039 { | 6928 { |
7040 /* Make sure that the reduction accumulator is vop[0]. */ | 6929 /* Make sure that the reduction accumulator is vop[0]. */ |
7041 if (reduc_index == 1) | 6930 if (reduc_index == 1) |
7042 { | 6931 { |
7043 gcc_assert (commutative_tree_code (code)); | 6932 gcc_assert (commutative_tree_code (code)); |
7057 else | 6946 else |
7058 { | 6947 { |
7059 if (op_type == ternary_op) | 6948 if (op_type == ternary_op) |
7060 vop[2] = vec_oprnds2[i]; | 6949 vop[2] = vec_oprnds2[i]; |
7061 | 6950 |
6951 if (masked_loop_p && mask_by_cond_expr) | |
6952 { | |
6953 tree mask = vect_get_loop_mask (gsi, masks, | |
6954 vec_num * ncopies, | |
6955 vectype_in, i * ncopies + j); | |
6956 build_vect_cond_expr (code, vop, mask, gsi); | |
6957 } | |
6958 | |
7062 gassign *new_stmt = gimple_build_assign (vec_dest, code, | 6959 gassign *new_stmt = gimple_build_assign (vec_dest, code, |
7063 vop[0], vop[1], vop[2]); | 6960 vop[0], vop[1], vop[2]); |
7064 new_temp = make_ssa_name (vec_dest, new_stmt); | 6961 new_temp = make_ssa_name (vec_dest, new_stmt); |
7065 gimple_assign_set_lhs (new_stmt, new_temp); | 6962 gimple_assign_set_lhs (new_stmt, new_temp); |
7066 new_stmt_info | 6963 new_stmt_info |
7067 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); | 6964 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); |
7068 } | 6965 } |
7069 | 6966 |
7070 if (slp_node) | 6967 if (slp_node) |
7071 { | 6968 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); |
7072 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); | |
7073 vect_defs.quick_push (new_temp); | |
7074 } | |
7075 else | |
7076 vect_defs[0] = new_temp; | |
7077 } | 6969 } |
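The mask_by_cond_expr branch above folds the loop mask into an operand with a VEC_COND_EXPR instead of calling a conditional internal function; per lane that amounts to substituting the operation's neutral value (0 for PLUS) in inactive lanes, roughly:

  /* Scalar picture of the VEC_COND_EXPR masking for a PLUS reduction.  */
  int cond_expr_masked_elt (int elt, _Bool lane_active)
  {
    return lane_active ? elt : 0;   /* neutral value when masked off */
  }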
7078 | 6970 |
7079 if (slp_node) | 6971 if (slp_node || single_defuse_cycle) |
7080 continue; | 6972 continue; |
7081 | 6973 |
7082 if (j == 0) | 6974 if (j == 0) |
7083 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info; | 6975 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info; |
7084 else | 6976 else |
7085 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; | 6977 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; |
7086 | 6978 |
7087 prev_stmt_info = new_stmt_info; | 6979 prev_stmt_info = new_stmt_info; |
7088 } | 6980 } |
7089 | 6981 |
7090 /* Finalize the reduction-phi (set its arguments) and create the | 6982 if (single_defuse_cycle && !slp_node) |
7091 epilog reduction code. */ | 6983 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info; |
7092 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node) | |
7093 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt); | |
7094 | |
7095 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi, | |
7096 epilog_copies, reduc_fn, phis, | |
7097 double_reduc, slp_node, slp_node_instance, | |
7098 cond_reduc_val, cond_reduc_op_code, | |
7099 neutral_op); | |
7100 | 6984 |
7101 return true; | 6985 return true; |
7102 } | 6986 } |
6987 | |
6988 /* Transform phase of a cycle PHI. */ | |
6989 | |
6990 bool | |
6991 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt, | |
6992 slp_tree slp_node, slp_instance slp_node_instance) | |
6993 { | |
6994 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); | |
6995 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
6996 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
6997 int i; | |
6998 int ncopies; | |
6999 stmt_vec_info prev_phi_info; | |
7000 int j; | |
7001 bool nested_cycle = false; | |
7002 int vec_num; | |
7003 | |
7004 if (nested_in_vect_loop_p (loop, stmt_info)) | |
7005 { | |
7006 loop = loop->inner; | |
7007 nested_cycle = true; | |
7008 } | |
7009 | |
7010 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); | |
7011 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info); | |
7012 stmt_vec_info reduc_info = info_for_reduction (stmt_info); | |
7013 gcc_assert (reduc_info->is_reduc_info); | |
7014 | |
7015 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION | |
7016 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION) | |
7017 /* Leave the scalar phi in place. */ | |
7018 return true; | |
7019 | |
7020 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); | |
7021 /* For a nested cycle we do not fill the above. */ | |
7022 if (!vectype_in) | |
7023 vectype_in = STMT_VINFO_VECTYPE (stmt_info); | |
7024 gcc_assert (vectype_in); | |
7025 | |
7026 if (slp_node) | |
7027 { | |
7028 /* The size vect_schedule_slp_instance computes is off for us. */ | |
7029 vec_num = vect_get_num_vectors | |
7030 (LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
7031 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in); | |
7032 ncopies = 1; | |
7033 } | |
7034 else | |
7035 { | |
7036 vec_num = 1; | |
7037 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); | |
7038 } | |
7039 | |
7040 /* Check whether we should use a single PHI node and accumulate | |
7041 vectors to one before the backedge. */ | |
7042 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info)) | |
7043 ncopies = 1; | |
7044 | |
7045 /* Create the destination vector */ | |
7046 gphi *phi = as_a <gphi *> (stmt_info->stmt); | |
7047 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi), | |
7048 vectype_out); | |
7049 | |
7050 /* Get the loop-entry arguments. */ | |
7051 tree vec_initial_def; | |
7052 auto_vec<tree> vec_initial_defs; | |
7053 if (slp_node) | |
7054 { | |
7055 vec_initial_defs.reserve (vec_num); | |
7056 gcc_assert (slp_node == slp_node_instance->reduc_phis); | |
7057 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info); | |
7058 tree neutral_op | |
7059 = neutral_op_for_slp_reduction (slp_node, vectype_out, | |
7060 STMT_VINFO_REDUC_CODE (reduc_info), | |
7061 first != NULL); | |
7062 get_initial_defs_for_reduction (slp_node_instance->reduc_phis, | |
7063 &vec_initial_defs, vec_num, | |
7064 first != NULL, neutral_op); | |
7065 } | |
7066 else | |
7067 { | |
7068 /* Get at the scalar def before the loop, that defines the initial | |
7069 value of the reduction variable. */ | |
7070 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi, | |
7071 loop_preheader_edge (loop)); | |
7072 /* Optimize: if initial_def is, for REDUC_MAX, smaller than the base | |
7073 and we can't use zero for induc_val, use initial_def; similarly | |
7074 for REDUC_MIN with initial_def larger than the base. */ | |
7075 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) | |
7076 { | |
7077 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); | |
7078 if (TREE_CODE (initial_def) == INTEGER_CST | |
7079 && !integer_zerop (induc_val) | |
7080 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR | |
7081 && tree_int_cst_lt (initial_def, induc_val)) | |
7082 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR | |
7083 && tree_int_cst_lt (induc_val, initial_def)))) | |
7084 { | |
7085 induc_val = initial_def; | |
7086 /* Communicate that we used the initial_def to epilogue | |
7087 generation. */ | |
7088 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE; | |
7089 } | |
7090 vec_initial_def = build_vector_from_val (vectype_out, induc_val); | |
7091 } | |
7092 else if (nested_cycle) | |
7093 { | |
7094 /* Do not use an adjustment def as that case is not supported | |
7095 correctly if ncopies is not one. */ | |
7096 vec_initial_def = vect_get_vec_def_for_operand (initial_def, | |
7097 reduc_stmt_info); | |
7098 } | |
7099 else | |
7100 { | |
7101 tree adjustment_def = NULL_TREE; | |
7102 tree *adjustment_defp = &adjustment_def; | |
7103 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); | |
7104 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) | |
7105 adjustment_defp = NULL; | |
7106 vec_initial_def | |
7107 = get_initial_def_for_reduction (reduc_stmt_info, code, | |
7108 initial_def, adjustment_defp); | |
7109 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def; | |
7110 } | |
7111 vec_initial_defs.create (1); | |
7112 vec_initial_defs.quick_push (vec_initial_def); | |
7113 } | |
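To make the non-SLP initial value concrete: for a sum whose scalar initial value is s0, the initial vector either carries s0 in lane 0 with neutral values elsewhere, or starts all-neutral and records s0 as the epilogue adjustment to be added back after the lanes are reduced. A hedged sketch of the two shapes (GNU vectors, names ours):

  typedef int v4si __attribute__ ((vector_size (16)));

  /* Variant 1: initial value folded into lane 0.  */
  v4si init_in_lane0 (int s0) { return (v4si) { s0, 0, 0, 0 }; }

  /* Variant 2: all-neutral start; s0 is kept as
     STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT and applied after the loop.  */
  v4si init_neutral (void) { return (v4si) { 0, 0, 0, 0 }; }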
7114 | |
7115 /* Generate the reduction PHIs upfront. */ | |
7116 prev_phi_info = NULL; | |
7117 for (i = 0; i < vec_num; i++) | |
7118 { | |
7119 tree vec_init_def = vec_initial_defs[i]; | |
7120 for (j = 0; j < ncopies; j++) | |
7121 { | |
7122 /* Create the reduction-phi that defines the reduction | |
7123 operand. */ | |
7124 gphi *new_phi = create_phi_node (vec_dest, loop->header); | |
7125 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); | |
7126 | |
7127 /* Set the loop-entry arg of the reduction-phi. */ | |
7128 if (j != 0 && nested_cycle) | |
7129 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo, | |
7130 vec_init_def); | |
7131 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop), | |
7132 UNKNOWN_LOCATION); | |
7133 | |
7134 /* The loop-latch arg is set in epilogue processing. */ | |
7135 | |
7136 if (slp_node) | |
7137 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info); | |
7138 else | |
7139 { | |
7140 if (j == 0) | |
7141 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info; | |
7142 else | |
7143 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info; | |
7144 prev_phi_info = new_phi_info; | |
7145 } | |
7146 } | |
7147 } | |
7148 | |
7149 return true; | |
7150 } | |
7151 | |
7152 /* Vectorizes LC PHIs. */ | |
7153 | |
7154 bool | |
7155 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt, | |
7156 slp_tree slp_node) | |
7157 { | |
7158 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
7159 if (!loop_vinfo | |
7160 || !is_a <gphi *> (stmt_info->stmt) | |
7161 || gimple_phi_num_args (stmt_info->stmt) != 1) | |
7162 return false; | |
7163 | |
7164 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def | |
7165 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) | |
7166 return false; | |
7167 | |
7168 if (!vec_stmt) /* transformation not required. */ | |
7169 { | |
7170 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type; | |
7171 return true; | |
7172 } | |
7173 | |
7174 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
7175 tree scalar_dest = gimple_phi_result (stmt_info->stmt); | |
7176 basic_block bb = gimple_bb (stmt_info->stmt); | |
7177 edge e = single_pred_edge (bb); | |
7178 tree vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
7179 vec<tree> vec_oprnds = vNULL; | |
7180 vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE, | |
7181 stmt_info, &vec_oprnds, NULL, slp_node); | |
7182 if (slp_node) | |
7183 { | |
7184 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | |
7185 gcc_assert (vec_oprnds.length () == vec_num); | |
7186 for (unsigned i = 0; i < vec_num; i++) | |
7187 { | |
7188 /* Create the vectorized LC PHI node. */ | |
7189 gphi *new_phi = create_phi_node (vec_dest, bb); | |
7190 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION); | |
7191 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); | |
7192 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info); | |
7193 } | |
7194 } | |
7195 else | |
7196 { | |
7197 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype); | |
7198 stmt_vec_info prev_phi_info = NULL; | |
7199 for (unsigned i = 0; i < ncopies; i++) | |
7200 { | |
7201 if (i != 0) | |
7202 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL); | |
7203 /* Create the vectorized LC PHI node. */ | |
7204 gphi *new_phi = create_phi_node (vec_dest, bb); | |
7205 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION); | |
7206 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); | |
7207 if (i == 0) | |
7208 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info; | |
7209 else | |
7210 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info; | |
7211 prev_phi_info = new_phi_info; | |
7212 } | |
7213 } | |
7214 vec_oprnds.release (); | |
7215 | |
7216 return true; | |
7217 } | |
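For orientation, the single-argument PHI handled here is the loop-closed SSA node on an exit edge, e.g. when an inner-loop value is consumed by the outer loop being vectorized. A sketch (names illustrative):

  void lc_phi_example (int a[][16], int *b, int n)
  {
    for (int i = 0; i < n; i++)   /* outer loop, the one vectorized */
      {
        int x = 0;
        for (int j = 0; j < 16; j++)
          x = a[i][j] + 1;        /* defined inside the inner loop */
        b[i] = x;                 /* reaches here via the one-argument
                                     loop-closed PHI x' = PHI <x (exit)> */
      }
  }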
7218 | |
7103 | 7219 |
7104 /* Function vect_min_worthwhile_factor. | 7220 /* Function vect_min_worthwhile_factor. |
7105 | 7221 |
7106 For a loop where we could vectorize the operation indicated by CODE, | 7222 For a loop where we could vectorize the operation indicated by CODE, |
7107 return the minimum vectorization factor that makes it worthwhile | 7223 return the minimum vectorization factor that makes it worthwhile |
7153 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, | 7269 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, |
7154 stmt_vec_info *vec_stmt, slp_tree slp_node, | 7270 stmt_vec_info *vec_stmt, slp_tree slp_node, |
7155 stmt_vector_for_cost *cost_vec) | 7271 stmt_vector_for_cost *cost_vec) |
7156 { | 7272 { |
7157 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | 7273 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
7158 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 7274 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
7159 unsigned ncopies; | 7275 unsigned ncopies; |
7160 bool nested_in_vect_loop = false; | 7276 bool nested_in_vect_loop = false; |
7161 struct loop *iv_loop; | 7277 class loop *iv_loop; |
7162 tree vec_def; | 7278 tree vec_def; |
7163 edge pe = loop_preheader_edge (loop); | 7279 edge pe = loop_preheader_edge (loop); |
7164 basic_block new_bb; | 7280 basic_block new_bb; |
7165 tree new_vec, vec_init, vec_step, t; | 7281 tree new_vec, vec_init, vec_step, t; |
7166 tree new_name; | 7282 tree new_name; |
7287 latch_e = loop_latch_edge (iv_loop); | 7403 latch_e = loop_latch_edge (iv_loop); |
7288 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); | 7404 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); |
7289 | 7405 |
7290 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); | 7406 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); |
7291 gcc_assert (step_expr != NULL_TREE); | 7407 gcc_assert (step_expr != NULL_TREE); |
7408 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype); | |
7292 | 7409 |
7293 pe = loop_preheader_edge (iv_loop); | 7410 pe = loop_preheader_edge (iv_loop); |
7294 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, | 7411 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, |
7295 loop_preheader_edge (iv_loop)); | 7412 loop_preheader_edge (iv_loop)); |
7296 | 7413 |
7297 stmts = NULL; | 7414 stmts = NULL; |
7298 if (!nested_in_vect_loop) | 7415 if (!nested_in_vect_loop) |
7299 { | 7416 { |
7300 /* Convert the initial value to the desired type. */ | 7417 /* Convert the initial value to the IV update type. */ |
7301 tree new_type = TREE_TYPE (vectype); | 7418 tree new_type = TREE_TYPE (step_expr); |
7302 init_expr = gimple_convert (&stmts, new_type, init_expr); | 7419 init_expr = gimple_convert (&stmts, new_type, init_expr); |
7303 | 7420 |
7304 /* If we are using the loop mask to "peel" for alignment then we need | 7421 /* If we are using the loop mask to "peel" for alignment then we need |
7305 to adjust the start value here. */ | 7422 to adjust the start value here. */ |
7306 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); | 7423 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); |
7316 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type, | 7433 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type, |
7317 init_expr, skip_step); | 7434 init_expr, skip_step); |
7318 } | 7435 } |
7319 } | 7436 } |
7320 | 7437 |
7321 /* Convert the step to the desired type. */ | |
7322 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr); | |
7323 | |
7324 if (stmts) | 7438 if (stmts) |
7325 { | 7439 { |
7326 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | 7440 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); |
7327 gcc_assert (!new_bb); | 7441 gcc_assert (!new_bb); |
7328 } | 7442 } |
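The new step_vectype plumbing performs the IV arithmetic in a vector of the step's own type and converts only the result to vectype, matching the gimple_convert / gimple_build pairs added below. A rough GNU-C analogue, assuming for illustration an unsigned vectype with a signed step:

  typedef int v4si __attribute__ ((vector_size (16)));
  typedef unsigned int v4usi __attribute__ ((vector_size (16)));

  v4usi iv_update (v4usi induc_def, v4si vec_step)
  {
    v4si t = (v4si) induc_def;   /* convert to step_vectype */
    t = t + vec_step;            /* PLUS_EXPR in the step type */
    return (v4usi) t;            /* convert back to vectype */
  }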
7351 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), | 7465 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), |
7352 expr, step_expr); | 7466 expr, step_expr); |
7353 if (! CONSTANT_CLASS_P (new_name)) | 7467 if (! CONSTANT_CLASS_P (new_name)) |
7354 new_name = vect_init_vector (stmt_info, new_name, | 7468 new_name = vect_init_vector (stmt_info, new_name, |
7355 TREE_TYPE (step_expr), NULL); | 7469 TREE_TYPE (step_expr), NULL); |
7356 new_vec = build_vector_from_val (vectype, new_name); | 7470 new_vec = build_vector_from_val (step_vectype, new_name); |
7357 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); | 7471 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL); |
7358 | 7472 |
7359 /* Now generate the IVs. */ | 7473 /* Now generate the IVs. */ |
7360 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); | 7474 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); |
7361 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | 7475 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); |
7362 unsigned elts = const_nunits * nvects; | 7476 unsigned elts = const_nunits * nvects; |
7365 gcc_assert (elts % group_size == 0); | 7479 gcc_assert (elts % group_size == 0); |
7366 tree elt = init_expr; | 7480 tree elt = init_expr; |
7367 unsigned ivn; | 7481 unsigned ivn; |
7368 for (ivn = 0; ivn < nivs; ++ivn) | 7482 for (ivn = 0; ivn < nivs; ++ivn) |
7369 { | 7483 { |
7370 tree_vector_builder elts (vectype, const_nunits, 1); | 7484 tree_vector_builder elts (step_vectype, const_nunits, 1); |
7371 stmts = NULL; | 7485 stmts = NULL; |
7372 for (unsigned eltn = 0; eltn < const_nunits; ++eltn) | 7486 for (unsigned eltn = 0; eltn < const_nunits; ++eltn) |
7373 { | 7487 { |
7374 if (ivn*const_nunits + eltn >= group_size | 7488 if (ivn*const_nunits + eltn >= group_size |
7375 && (ivn * const_nunits + eltn) % group_size == 0) | 7489 && (ivn * const_nunits + eltn) % group_size == 0) |
7376 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt), | 7490 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt), |
7377 elt, step_expr); | 7491 elt, step_expr); |
7378 elts.quick_push (elt); | 7492 elts.quick_push (elt); |
7379 } | 7493 } |
7380 vec_init = gimple_build_vector (&stmts, &elts); | 7494 vec_init = gimple_build_vector (&stmts, &elts); |
7495 vec_init = gimple_convert (&stmts, vectype, vec_init); | |
7381 if (stmts) | 7496 if (stmts) |
7382 { | 7497 { |
7383 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | 7498 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); |
7384 gcc_assert (!new_bb); | 7499 gcc_assert (!new_bb); |
7385 } | 7500 } |
7390 stmt_vec_info induction_phi_info | 7505 stmt_vec_info induction_phi_info |
7391 = loop_vinfo->add_stmt (induction_phi); | 7506 = loop_vinfo->add_stmt (induction_phi); |
7392 induc_def = PHI_RESULT (induction_phi); | 7507 induc_def = PHI_RESULT (induction_phi); |
7393 | 7508 |
7394 /* Create the iv update inside the loop */ | 7509 /* Create the iv update inside the loop */ |
7395 vec_def = make_ssa_name (vec_dest); | 7510 gimple_seq stmts = NULL; |
7396 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step); | 7511 vec_def = gimple_convert (&stmts, step_vectype, induc_def); |
7397 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); | 7512 vec_def = gimple_build (&stmts, |
7398 loop_vinfo->add_stmt (new_stmt); | 7513 PLUS_EXPR, step_vectype, vec_def, vec_step); |
7514 vec_def = gimple_convert (&stmts, vectype, vec_def); | |
7515 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def)); | |
7516 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); | |
7399 | 7517 |
7400 /* Set the arguments of the phi node: */ | 7518 /* Set the arguments of the phi node: */ |
7401 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); | 7519 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); |
7402 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), | 7520 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), |
7403 UNKNOWN_LOCATION); | 7521 UNKNOWN_LOCATION); |
7421 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), | 7539 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), |
7422 expr, step_expr); | 7540 expr, step_expr); |
7423 if (! CONSTANT_CLASS_P (new_name)) | 7541 if (! CONSTANT_CLASS_P (new_name)) |
7424 new_name = vect_init_vector (stmt_info, new_name, | 7542 new_name = vect_init_vector (stmt_info, new_name, |
7425 TREE_TYPE (step_expr), NULL); | 7543 TREE_TYPE (step_expr), NULL); |
7426 new_vec = build_vector_from_val (vectype, new_name); | 7544 new_vec = build_vector_from_val (step_vectype, new_name); |
7427 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); | 7545 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL); |
7428 for (; ivn < nvects; ++ivn) | 7546 for (; ivn < nvects; ++ivn) |
7429 { | 7547 { |
7430 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt; | 7548 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt; |
7431 tree def; | 7549 tree def; |
7432 if (gimple_code (iv) == GIMPLE_PHI) | 7550 if (gimple_code (iv) == GIMPLE_PHI) |
7433 def = gimple_phi_result (iv); | 7551 def = gimple_phi_result (iv); |
7434 else | 7552 else |
7435 def = gimple_assign_lhs (iv); | 7553 def = gimple_assign_lhs (iv); |
7436 new_stmt = gimple_build_assign (make_ssa_name (vectype), | 7554 gimple_seq stmts = NULL; |
7437 PLUS_EXPR, | 7555 def = gimple_convert (&stmts, step_vectype, def); |
7438 def, vec_step); | 7556 def = gimple_build (&stmts, |
7557 PLUS_EXPR, step_vectype, def, vec_step); | |
7558 def = gimple_convert (&stmts, vectype, def); | |
7439 if (gimple_code (iv) == GIMPLE_PHI) | 7559 if (gimple_code (iv) == GIMPLE_PHI) |
7440 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); | 7560 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
7441 else | 7561 else |
7442 { | 7562 { |
7443 gimple_stmt_iterator tgsi = gsi_for_stmt (iv); | 7563 gimple_stmt_iterator tgsi = gsi_for_stmt (iv); |
7444 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING); | 7564 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING); |
7445 } | 7565 } |
7446 SLP_TREE_VEC_STMTS (slp_node).quick_push | 7566 SLP_TREE_VEC_STMTS (slp_node).quick_push |
7447 (loop_vinfo->add_stmt (new_stmt)); | 7567 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def))); |
7448 } | 7568 } |
7449 } | 7569 } |
7450 | 7570 |
7451 return true; | 7571 return true; |
7452 } | 7572 } |
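A hedged example of the SLP case this branch generates code for: two interleaved inductions in one loop body. With group_size 2 and four-lane vectors, the initial vector interleaves both start values, stepping a member each time a lane crosses a group boundary, roughly { x0, y0, x0 + 1, y0 + 2 }.

  void slp_induction (int *out, int n)
  {
    int x = 0, y = 100;   /* two IVs, steps 1 and 2 */
    for (int i = 0; i < n; i++)
      {
        out[2 * i] = x;      x += 1;
        out[2 * i + 1] = y;  y += 2;
      }
  }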
7478 else | 7598 else |
7479 { | 7599 { |
7480 /* iv_loop is the loop to be vectorized. Create: | 7600 /* iv_loop is the loop to be vectorized. Create: |
7481 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ | 7601 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ |
7482 stmts = NULL; | 7602 stmts = NULL; |
7483 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr); | 7603 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr); |
7484 | 7604 |
7485 unsigned HOST_WIDE_INT const_nunits; | 7605 unsigned HOST_WIDE_INT const_nunits; |
7486 if (nunits.is_constant (&const_nunits)) | 7606 if (nunits.is_constant (&const_nunits)) |
7487 { | 7607 { |
7488 tree_vector_builder elts (vectype, const_nunits, 1); | 7608 tree_vector_builder elts (step_vectype, const_nunits, 1); |
7489 elts.quick_push (new_name); | 7609 elts.quick_push (new_name); |
7490 for (i = 1; i < const_nunits; i++) | 7610 for (i = 1; i < const_nunits; i++) |
7491 { | 7611 { |
7492 /* Create: new_name_i = new_name + step_expr */ | 7612 /* Create: new_name_i = new_name + step_expr */ |
7493 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), | 7613 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), |
7498 new_name_nunits-1] */ | 7618 new_name_nunits-1] */ |
7499 vec_init = gimple_build_vector (&stmts, &elts); | 7619 vec_init = gimple_build_vector (&stmts, &elts); |
7500 } | 7620 } |
7501 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))) | 7621 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))) |
7502 /* Build the initial value directly from a VEC_SERIES_EXPR. */ | 7622 /* Build the initial value directly from a VEC_SERIES_EXPR. */ |
7503 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype, | 7623 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype, |
7504 new_name, step_expr); | 7624 new_name, step_expr); |
7505 else | 7625 else |
7506 { | 7626 { |
7507 /* Build: | 7627 /* Build: |
7508 [base, base, base, ...] | 7628 [base, base, base, ...] |
7509 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */ | 7629 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */ |
7510 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))); | 7630 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))); |
7511 gcc_assert (flag_associative_math); | 7631 gcc_assert (flag_associative_math); |
7512 tree index = build_index_vector (vectype, 0, 1); | 7632 tree index = build_index_vector (step_vectype, 0, 1); |
7513 tree base_vec = gimple_build_vector_from_val (&stmts, vectype, | 7633 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype, |
7514 new_name); | 7634 new_name); |
7515 tree step_vec = gimple_build_vector_from_val (&stmts, vectype, | 7635 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype, |
7516 step_expr); | 7636 step_expr); |
7517 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index); | 7637 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index); |
7518 vec_init = gimple_build (&stmts, MULT_EXPR, vectype, | 7638 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype, |
7519 vec_init, step_vec); | 7639 vec_init, step_vec); |
7520 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype, | 7640 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype, |
7521 vec_init, base_vec); | 7641 vec_init, base_vec); |
7522 } | 7642 } |
7643 vec_init = gimple_convert (&stmts, vectype, vec_init); | |
7523 | 7644 |
7524 if (stmts) | 7645 if (stmts) |
7525 { | 7646 { |
7526 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | 7647 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); |
7527 gcc_assert (!new_bb); | 7648 gcc_assert (!new_bb); |
7556 } | 7677 } |
7557 | 7678 |
7558 t = unshare_expr (new_name); | 7679 t = unshare_expr (new_name); |
7559 gcc_assert (CONSTANT_CLASS_P (new_name) | 7680 gcc_assert (CONSTANT_CLASS_P (new_name) |
7560 || TREE_CODE (new_name) == SSA_NAME); | 7681 || TREE_CODE (new_name) == SSA_NAME); |
7561 new_vec = build_vector_from_val (vectype, t); | 7682 new_vec = build_vector_from_val (step_vectype, t); |
7562 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); | 7683 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL); |
7563 | 7684 |
7564 | 7685 |
7565 /* Create the following def-use cycle: | 7686 /* Create the following def-use cycle: |
7566 loop prolog: | 7687 loop prolog: |
7567 vec_init = ... | 7688 vec_init = ... |
7578 induction_phi = create_phi_node (vec_dest, iv_loop->header); | 7699 induction_phi = create_phi_node (vec_dest, iv_loop->header); |
7579 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi); | 7700 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi); |
7580 induc_def = PHI_RESULT (induction_phi); | 7701 induc_def = PHI_RESULT (induction_phi); |
7581 | 7702 |
7582 /* Create the iv update inside the loop */ | 7703 /* Create the iv update inside the loop */ |
7583 vec_def = make_ssa_name (vec_dest); | 7704 stmts = NULL; |
7584 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step); | 7705 vec_def = gimple_convert (&stmts, step_vectype, induc_def); |
7585 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); | 7706 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step); |
7707 vec_def = gimple_convert (&stmts, vectype, vec_def); | |
7708 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); | |
7709 new_stmt = SSA_NAME_DEF_STMT (vec_def); | |
7586 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt); | 7710 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt); |
7587 | 7711 |
7588 /* Set the arguments of the phi node: */ | 7712 /* Set the arguments of the phi node: */ |
7589 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); | 7713 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); |
7590 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), | 7714 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), |
7622 } | 7746 } |
7623 | 7747 |
7624 t = unshare_expr (new_name); | 7748 t = unshare_expr (new_name); |
7625 gcc_assert (CONSTANT_CLASS_P (new_name) | 7749 gcc_assert (CONSTANT_CLASS_P (new_name) |
7626 || TREE_CODE (new_name) == SSA_NAME); | 7750 || TREE_CODE (new_name) == SSA_NAME); |
7627 new_vec = build_vector_from_val (vectype, t); | 7751 new_vec = build_vector_from_val (step_vectype, t); |
7628 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); | 7752 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL); |
7629 | 7753 |
7630 vec_def = induc_def; | 7754 vec_def = induc_def; |
7631 prev_stmt_vinfo = induction_phi_info; | 7755 prev_stmt_vinfo = induction_phi_info; |
7632 for (i = 1; i < ncopies; i++) | 7756 for (i = 1; i < ncopies; i++) |
7633 { | 7757 { |
7634 /* vec_i = vec_prev + vec_step */ | 7758 /* vec_i = vec_prev + vec_step */ |
7635 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, | 7759 gimple_seq stmts = NULL; |
7636 vec_def, vec_step); | 7760 vec_def = gimple_convert (&stmts, step_vectype, vec_def); |
7637 vec_def = make_ssa_name (vec_dest, new_stmt); | 7761 vec_def = gimple_build (&stmts, |
7638 gimple_assign_set_lhs (new_stmt, vec_def); | 7762 PLUS_EXPR, step_vectype, vec_def, vec_step); |
7763 vec_def = gimple_convert (&stmts, vectype, vec_def); | |
7639 | 7764 |
7640 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); | 7765 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
7766 new_stmt = SSA_NAME_DEF_STMT (vec_def); | |
7641 new_stmt_info = loop_vinfo->add_stmt (new_stmt); | 7767 new_stmt_info = loop_vinfo->add_stmt (new_stmt); |
7642 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info; | 7768 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info; |
7643 prev_stmt_vinfo = new_stmt_info; | 7769 prev_stmt_vinfo = new_stmt_info; |
7644 } | 7770 } |
7645 } | 7771 } |
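Spelling out the copy chain built by this loop: each additional copy of the IV is the previous copy plus a vector holding one vector-length's worth of steps, exactly the "vec_i = vec_prev + vec_step" comment above. A one-line GNU-C sketch:

  typedef int v4si __attribute__ ((vector_size (16)));

  v4si iv_copy (v4si vec_prev, v4si vec_step)
  {
    return vec_prev + vec_step;   /* vec_i = vec_prev + vec_step */
  }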
7691 STMT_INFO computes a value that is used outside the loop. Check if | 7817 STMT_INFO computes a value that is used outside the loop. Check if |
7692 it can be supported. */ | 7818 it can be supported. */ |
7693 | 7819 |
7694 bool | 7820 bool |
7695 vectorizable_live_operation (stmt_vec_info stmt_info, | 7821 vectorizable_live_operation (stmt_vec_info stmt_info, |
7696 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, | 7822 gimple_stmt_iterator *gsi, |
7697 slp_tree slp_node, int slp_index, | 7823 slp_tree slp_node, slp_instance slp_node_instance, |
7698 stmt_vec_info *vec_stmt, | 7824 int slp_index, bool vec_stmt_p, |
7699 stmt_vector_for_cost *) | 7825 stmt_vector_for_cost *) |
7700 { | 7826 { |
7701 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | 7827 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
7702 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 7828 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
7703 imm_use_iterator imm_iter; | 7829 imm_use_iterator imm_iter; |
7704 tree lhs, lhs_type, bitsize, vec_bitsize; | 7830 tree lhs, lhs_type, bitsize, vec_bitsize; |
7705 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | 7831 tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
7706 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); | 7832 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); |
7707 int ncopies; | 7833 int ncopies; |
7710 int vec_entry = 0; | 7836 int vec_entry = 0; |
7711 poly_uint64 vec_index = 0; | 7837 poly_uint64 vec_index = 0; |
7712 | 7838 |
7713 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); | 7839 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); |
7714 | 7840 |
7715 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) | 7841 /* If a stmt of a reduction is live, vectorize it via |
7716 return false; | 7842 vect_create_epilog_for_reduction. vectorizable_reduction assessed |
7843 validity so just trigger the transform here. */ | |
7844 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))) | |
7845 { | |
7846 if (!vec_stmt_p) | |
7847 return true; | |
7848 if (slp_node) | |
7849 { | |
7850 /* For reduction chains the meta-info is attached to | |
7851 the group leader. */ | |
7852 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) | |
7853 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); | |
7854 /* For SLP reductions we vectorize the epilogue for | |
7855 all involved stmts together. */ | |
7856 else if (slp_index != 0) | |
7857 return true; | |
7858 } | |
7859 stmt_vec_info reduc_info = info_for_reduction (stmt_info); | |
7860 gcc_assert (reduc_info->is_reduc_info); | |
7861 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION | |
7862 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION) | |
7863 return true; | |
7864 vect_create_epilog_for_reduction (stmt_info, slp_node, | |
7865 slp_node_instance); | |
7866 return true; | |
7867 } | |
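Hedged examples separating the path just taken from the one below: a live reduction result is materialized by the reduction epilogue, while an ordinary live definition is handled later by extracting the relevant lane of the last vector. The use function here is a hypothetical consumer.

  extern void use (int, int);

  void live_values (const int *a, int *b, int n)
  {
    int s = 0, t = 0;
    for (int i = 0; i < n; i++)
      {
        s += a[i];      /* live reduction: epilogue reduces the lanes */
        t = a[i] * 2;   /* plain live def: extract the last lane */
        b[i] = t;
      }
    use (s, t);         /* both values are live after the loop */
  }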
7717 | 7868 |
7718 /* FORNOW. CHECKME. */ | 7869 /* FORNOW. CHECKME. */ |
7719 if (nested_in_vect_loop_p (loop, stmt_info)) | 7870 if (nested_in_vect_loop_p (loop, stmt_info)) |
7720 return false; | 7871 return false; |
7721 | 7872 |
7759 " final result.\n"); | 7910 " final result.\n"); |
7760 return false; | 7911 return false; |
7761 } | 7912 } |
7762 } | 7913 } |
7763 | 7914 |
7764 if (!vec_stmt) | 7915 if (!vec_stmt_p) |
7765 { | 7916 { |
7766 /* No transformation required. */ | 7917 /* No transformation required. */ |
7767 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) | 7918 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) |
7768 { | 7919 { |
7769 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, | 7920 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, |
7795 else | 7946 else |
7796 { | 7947 { |
7797 gcc_assert (ncopies == 1 && !slp_node); | 7948 gcc_assert (ncopies == 1 && !slp_node); |
7798 vect_record_loop_mask (loop_vinfo, | 7949 vect_record_loop_mask (loop_vinfo, |
7799 &LOOP_VINFO_MASKS (loop_vinfo), | 7950 &LOOP_VINFO_MASKS (loop_vinfo), |
7800 1, vectype); | 7951 1, vectype, NULL); |
7801 } | 7952 } |
7802 } | 7953 } |
7803 return true; | 7954 return true; |
7804 } | 7955 } |
7805 | 7956 |
7905 } | 8056 } |
7906 | 8057 |
7907 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */ | 8058 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */ |
7908 | 8059 |
7909 static void | 8060 static void |
7910 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info) | 8061 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info) |
7911 { | 8062 { |
7912 ssa_op_iter op_iter; | 8063 ssa_op_iter op_iter; |
7913 imm_use_iterator imm_iter; | 8064 imm_use_iterator imm_iter; |
7914 def_operand_p def_p; | 8065 def_operand_p def_p; |
7915 gimple *ustmt; | 8066 gimple *ustmt; |
7961 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters)) | 8112 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters)) |
7962 return true; | 8113 return true; |
7963 } | 8114 } |
7964 | 8115 |
7965 widest_int max; | 8116 widest_int max; |
7966 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 8117 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
7967 /* Check the upper bound of loop niters. */ | 8118 /* Check the upper bound of loop niters. */ |
7968 if (get_max_loop_iterations (loop, &max)) | 8119 if (get_max_loop_iterations (loop, &max)) |
7969 { | 8120 { |
7970 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); | 8121 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); |
7971 signop sgn = TYPE_SIGN (type); | 8122 signop sgn = TYPE_SIGN (type); |
7974 return true; | 8125 return true; |
7975 } | 8126 } |
7976 return false; | 8127 return false; |
7977 } | 8128 } |
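
The elided middle of loop_niters_no_overflow compares the maximum latch-iteration count MAX against the largest value representable in the type of the loop's niters expression. A minimal standalone sketch of that comparison, assuming an unsigned niters type (plain uint64_t stands in for widest_int; none of these names are GCC API):

#include <cstdint>

/* NITERS = NITERSM1 + 1 cannot wrap if the maximum latch-iteration
   count stays strictly below the maximum value of the niters type.  */
static bool
niters_no_overflow_sketch (uint64_t max_latch_iters, unsigned precision)
{
  uint64_t type_max = (precision >= 64
                       ? UINT64_MAX
                       : (1ULL << precision) - 1);
  return max_latch_iters < type_max;
}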
7978 | 8129 |
7979 /* Return a mask type with half the number of elements as TYPE. */ | 8130 /* Return a mask type with half the number of elements as OLD_TYPE, |
8131 given that it should have mode NEW_MODE. */ | |
7980 | 8132 |
7981 tree | 8133 tree |
7982 vect_halve_mask_nunits (tree type) | 8134 vect_halve_mask_nunits (tree old_type, machine_mode new_mode) |
7983 { | 8135 { |
7984 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2); | 8136 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2); |
7985 return build_truth_vector_type (nunits, current_vector_size); | 8137 return build_truth_vector_type_for_mode (nunits, new_mode); |
7986 } | 8138 } |
7987 | 8139 |
7988 /* Return a mask type with twice as many elements as TYPE. */ | 8140 /* Return a mask type with twice as many elements as OLD_TYPE, |
8141 given that it should have mode NEW_MODE. */ | |
7989 | 8142 |
7990 tree | 8143 tree |
7991 vect_double_mask_nunits (tree type) | 8144 vect_double_mask_nunits (tree old_type, machine_mode new_mode) |
7992 { | 8145 { |
7993 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2; | 8146 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2; |
7994 return build_truth_vector_type (nunits, current_vector_size); | 8147 return build_truth_vector_type_for_mode (nunits, new_mode); |
7995 } | 8148 } |
7996 | 8149 |
7997 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to | 8150 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to |
7998 contain a sequence of NVECTORS masks that each control a vector of type | 8151 contain a sequence of NVECTORS masks that each control a vector of type |
7999 VECTYPE. */ | 8152 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND |
8153 these vector masks with the vector version of SCALAR_MASK. */ | |
8000 | 8154 |
8001 void | 8155 void |
8002 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, | 8156 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, |
8003 unsigned int nvectors, tree vectype) | 8157 unsigned int nvectors, tree vectype, tree scalar_mask) |
8004 { | 8158 { |
8005 gcc_assert (nvectors != 0); | 8159 gcc_assert (nvectors != 0); |
8006 if (masks->length () < nvectors) | 8160 if (masks->length () < nvectors) |
8007 masks->safe_grow_cleared (nvectors); | 8161 masks->safe_grow_cleared (nvectors); |
8008 rgroup_masks *rgm = &(*masks)[nvectors - 1]; | 8162 rgroup_masks *rgm = &(*masks)[nvectors - 1]; |
8009 /* The number of scalars per iteration and the number of vectors are | 8163 /* The number of scalars per iteration and the number of vectors are |
8010 both compile-time constants. */ | 8164 both compile-time constants. */ |
8011 unsigned int nscalars_per_iter | 8165 unsigned int nscalars_per_iter |
8012 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), | 8166 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), |
8013 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); | 8167 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); |
8168 | |
8169 if (scalar_mask) | |
8170 { | |
8171 scalar_cond_masked_key cond (scalar_mask, nvectors); | |
8172 loop_vinfo->scalar_cond_masked_set.add (cond); | |
8173 } | |
8174 | |
8014 if (rgm->max_nscalars_per_iter < nscalars_per_iter) | 8175 if (rgm->max_nscalars_per_iter < nscalars_per_iter) |
8015 { | 8176 { |
8016 rgm->max_nscalars_per_iter = nscalars_per_iter; | 8177 rgm->max_nscalars_per_iter = nscalars_per_iter; |
8017 rgm->mask_type = build_same_sized_truth_vector_type (vectype); | 8178 rgm->mask_type = truth_type_for (vectype); |
8018 } | 8179 } |
8019 } | 8180 } |
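
vect_record_loop_mask keys rgroups by their vector count: masks for rgroups needing NVECTORS vectors live at index NVECTORS - 1, and each rgroup keeps the largest scalars-per-iteration value implied by any mask recorded for it. A sketch of that bookkeeping with plain integers in place of poly_uint64 and tree types (illustrative names, not GCC API):

#include <cassert>
#include <vector>

struct rgroup_sketch
{
  unsigned max_nscalars_per_iter = 0;
};

static void
record_loop_mask_sketch (std::vector<rgroup_sketch> &masks,
                         unsigned nvectors, unsigned nunits, unsigned vf)
{
  assert (nvectors != 0 && (nvectors * nunits) % vf == 0);
  if (masks.size () < nvectors)
    masks.resize (nvectors);            /* mirrors safe_grow_cleared */
  rgroup_sketch &rgm = masks[nvectors - 1];
  /* E.g. 2 vectors of 8 elements at VF 8 control 2 scalars per iteration.  */
  unsigned nscalars_per_iter = nvectors * nunits / vf;
  if (rgm.max_nscalars_per_iter < nscalars_per_iter)
    rgm.max_nscalars_per_iter = nscalars_per_iter;
}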
8020 | 8181 |
8021 /* Given a complete set of masks MASKS, extract mask number INDEX | 8182 /* Given a complete set of masks MASKS, extract mask number INDEX |
8022 for an rgroup that operates on NVECTORS vectors of type VECTYPE, | 8183 for an rgroup that operates on NVECTORS vectors of type VECTYPE, |
8057 We can then view-convert the mask so that each sequence of | 8218 We can then view-convert the mask so that each sequence of |
8058 N elements is replaced by a single element. */ | 8219 N elements is replaced by a single element. */ |
8059 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), | 8220 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), |
8060 TYPE_VECTOR_SUBPARTS (vectype))); | 8221 TYPE_VECTOR_SUBPARTS (vectype))); |
8061 gimple_seq seq = NULL; | 8222 gimple_seq seq = NULL; |
8062 mask_type = build_same_sized_truth_vector_type (vectype); | 8223 mask_type = truth_type_for (vectype); |
8063 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); | 8224 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); |
8064 if (seq) | 8225 if (seq) |
8065 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); | 8226 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); |
8066 } | 8227 } |
8067 return mask; | 8228 return mask; |
8069 | 8230 |
8070 /* Scale the profiling counters of LOOP, which has been vectorized | 8231 /* Scale the profiling counters of LOOP, which has been vectorized |
8071 by factor VF, to match the reduced iteration count. */ | 8232 by factor VF, to match the reduced iteration count. */ |
8072 | 8233 |
8073 static void | 8234 static void |
8074 scale_profile_for_vect_loop (struct loop *loop, unsigned vf) | 8235 scale_profile_for_vect_loop (class loop *loop, unsigned vf) |
8075 { | 8236 { |
8076 edge preheader = loop_preheader_edge (loop); | 8237 edge preheader = loop_preheader_edge (loop); |
8077 /* Reduce loop iterations by the vectorization factor. */ | 8238 /* Reduce loop iterations by the vectorization factor. */ |
8078 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf); | 8239 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf); |
8079 profile_count freq_h = loop->header->count, freq_e = preheader->count (); | 8240 profile_count freq_h = loop->header->count, freq_e = preheader->count (); |
8107 | 8268 |
8108 static void | 8269 static void |
8109 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, | 8270 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, |
8110 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store) | 8271 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store) |
8111 { | 8272 { |
8112 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 8273 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
8113 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | 8274 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
8114 | 8275 |
8115 if (dump_enabled_p ()) | 8276 if (dump_enabled_p ()) |
8116 dump_printf_loc (MSG_NOTE, vect_location, | 8277 dump_printf_loc (MSG_NOTE, vect_location, |
8117 "------>vectorizing statement: %G", stmt_info->stmt); | 8278 "------>vectorizing statement: %G", stmt_info->stmt); |
8145 | 8306 |
8146 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL)) | 8307 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL)) |
8147 *seen_store = stmt_info; | 8308 *seen_store = stmt_info; |
8148 } | 8309 } |
8149 | 8310 |
8311 /* Helper function to pass to simplify_replace_tree to enable replacing trees | |
8312 in the hash_map with their corresponding values. */ | |
8313 | |
8314 static tree | |
8315 find_in_mapping (tree t, void *context) | |
8316 { | |
8317 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context; | |
8318 | |
8319 tree *value = mapping->get (t); | |
8320 return value ? *value : t; | |
8321 } | |
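
find_in_mapping is a value-replacement callback: look the tree up in the hash_map and fall back to the input itself when there is no entry, so unmapped operands pass through unchanged. A standalone analogue using the C++ standard library (strings stand in for trees; illustrative only):

#include <string>
#include <unordered_map>

static std::string
find_in_mapping_sketch (const std::string &t,
                        const std::unordered_map<std::string, std::string> &m)
{
  auto it = m.find (t);
  return it != m.end () ? it->second : t;   /* identity fallback */
}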
8322 | |
8323 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the | |
8324 original loop that has now been vectorized. | |
8325 | |
8326 The inits of the data_references need to be advanced with the number of | |
8327 iterations of the main loop. This has been computed in vect_do_peeling and | |
8328 is stored in parameter ADVANCE. We first restore the data_references' | |
8329 initial offsets with the values recorded in ORIG_DRS_INIT. | |
8330 | |
8331 Since the loop_vec_info of this EPILOGUE was constructed for the original | |
8332 loop, its stmt_vec_infos all point to the original statements. These need | |
8333 to be updated to point to their corresponding copies, as do the SSA_NAMEs | |
8334 in their PATTERN_DEF_SEQs and RELATED_STMTs. | |
8335 | |
8336 The data_references' connections also need to be updated. Their | |
8337 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's | |
8338 stmt_vec_infos, their statements need to point to their corresponding copy, | |
8339 if they are gather loads or scatter stores then their references need to | |
8340 be updated to point to the corresponding copies. Finally, we set | |
8341 'base_misaligned' to false, as we have already peeled for alignment in the | |
8342 prologue of the main loop. */ | |
8343 | |
8344 static void | |
8345 update_epilogue_loop_vinfo (class loop *epilogue, tree advance) | |
8346 { | |
8347 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue); | |
8348 auto_vec<gimple *> stmt_worklist; | |
8349 hash_map<tree,tree> mapping; | |
8350 gimple *orig_stmt, *new_stmt; | |
8351 gimple_stmt_iterator epilogue_gsi; | |
8352 gphi_iterator epilogue_phi_gsi; | |
8353 stmt_vec_info stmt_vinfo = NULL, related_vinfo; | |
8354 basic_block *epilogue_bbs = get_loop_body (epilogue); | |
8355 unsigned i; | |
8356 | |
8357 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs; | |
8358 | |
8359 /* Advance the data_references with the number of iterations of the previous | |
8360 loop and its prologue. */ | |
8361 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR); | |
8362 | |
8363 | |
8364 /* The EPILOGUE loop is a copy of the original loop so they share the same | |
8365 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to | |
8366 point to the copied statements. We also create a mapping from each LHS in | |
8367 the original loop to the corresponding LHS in the EPILOGUE, and worklists to | |
8368 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */ | |
8369 for (unsigned i = 0; i < epilogue->num_nodes; ++i) | |
8370 { | |
8371 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]); | |
8372 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi)) | |
8373 { | |
8374 new_stmt = epilogue_phi_gsi.phi (); | |
8375 | |
8376 gcc_assert (gimple_uid (new_stmt) > 0); | |
8377 stmt_vinfo | |
8378 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; | |
8379 | |
8380 orig_stmt = STMT_VINFO_STMT (stmt_vinfo); | |
8381 STMT_VINFO_STMT (stmt_vinfo) = new_stmt; | |
8382 | |
8383 mapping.put (gimple_phi_result (orig_stmt), | |
8384 gimple_phi_result (new_stmt)); | |
8385 /* PHI nodes cannot have patterns or related statements. */ | |
8386 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL | |
8387 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL); | |
8388 } | |
8389 | |
8390 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]); | |
8391 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi)) | |
8392 { | |
8393 new_stmt = gsi_stmt (epilogue_gsi); | |
8394 | |
8395 gcc_assert (gimple_uid (new_stmt) > 0); | |
8396 stmt_vinfo | |
8397 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; | |
8398 | |
8399 orig_stmt = STMT_VINFO_STMT (stmt_vinfo); | |
8400 STMT_VINFO_STMT (stmt_vinfo) = new_stmt; | |
8401 | |
8402 if (tree old_lhs = gimple_get_lhs (orig_stmt)) | |
8403 mapping.put (old_lhs, gimple_get_lhs (new_stmt)); | |
8404 | |
8405 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo)) | |
8406 { | |
8407 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo); | |
8408 for (gimple_stmt_iterator gsi = gsi_start (seq); | |
8409 !gsi_end_p (gsi); gsi_next (&gsi)) | |
8410 stmt_worklist.safe_push (gsi_stmt (gsi)); | |
8411 } | |
8412 | |
8413 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo); | |
8414 if (related_vinfo != NULL && related_vinfo != stmt_vinfo) | |
8415 { | |
8416 gimple *stmt = STMT_VINFO_STMT (related_vinfo); | |
8417 stmt_worklist.safe_push (stmt); | |
8418 /* Set BB such that the assert in | |
8419 'get_initial_def_for_reduction' is able to determine that | |
8420 the BB of the related stmt is inside this loop. */ | |
8421 gimple_set_bb (stmt, | |
8422 gimple_bb (new_stmt)); | |
8423 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo); | |
8424 gcc_assert (related_vinfo == NULL | |
8425 || related_vinfo == stmt_vinfo); | |
8426 } | |
8427 } | |
8428 } | |
8429 | |
8430 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed | |
8431 using the original main loop and thus need to be updated to refer to the | |
8432 cloned variables used in the epilogue. */ | |
8433 for (unsigned i = 0; i < stmt_worklist.length (); ++i) | |
8434 { | |
8435 gimple *stmt = stmt_worklist[i]; | |
8436 tree *new_op; | |
8437 | |
8438 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j) | |
8439 { | |
8440 tree op = gimple_op (stmt, j); | |
8441 if ((new_op = mapping.get (op))) | |
8442 gimple_set_op (stmt, j, *new_op); | |
8443 else | |
8444 { | |
8445 /* PR92429: The last argument of simplify_replace_tree disables | |
8446 folding when replacing arguments. This is required as | |
8447 otherwise you might end up with different statements than the | |
8448 ones analyzed in vect_loop_analyze, leading to different | |
8449 vectorization. */ | |
8450 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE, | |
8451 &find_in_mapping, &mapping, false); | |
8452 gimple_set_op (stmt, j, op); | |
8453 } | |
8454 } | |
8455 } | |
8456 | |
8457 struct data_reference *dr; | |
8458 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs; | |
8459 FOR_EACH_VEC_ELT (datarefs, i, dr) | |
8460 { | |
8461 orig_stmt = DR_STMT (dr); | |
8462 gcc_assert (gimple_uid (orig_stmt) > 0); | |
8463 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1]; | |
8464 /* Data references for gather loads and scatter stores do not use the | |
8465 updated offset we set using ADVANCE. Instead we have to make sure the | |
8466 references in the data references point to the corresponding copies of | |
8467 the originals in the epilogue. */ | |
8468 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo)) | |
8469 == VMAT_GATHER_SCATTER) | |
8470 { | |
8471 DR_REF (dr) | |
8472 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE, | |
8473 &find_in_mapping, &mapping); | |
8474 DR_BASE_ADDRESS (dr) | |
8475 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE, | |
8476 &find_in_mapping, &mapping); | |
8477 } | |
8478 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo); | |
8479 stmt_vinfo->dr_aux.stmt = stmt_vinfo; | |
8480 /* The vector size of the epilogue is smaller than that of the main loop | |
8481 so the alignment is either the same or lower. This means the DR will | |
8482 by definition be aligned. */ | |
8483 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false; | |
8484 } | |
8485 | |
8486 epilogue_vinfo->shared->datarefs_copy.release (); | |
8487 epilogue_vinfo->shared->save_datarefs (); | |
8488 } | |
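
The load-bearing trick in update_epilogue_loop_vinfo is that the epilogue copy shares gimple UIDs with the original loop, so each copied statement can locate its stmt_vec_info by UID while an old-LHS-to-new-LHS map rewrites any remaining operand references. A compressed sketch of that two-pass scheme over plain data (all names hypothetical):

#include <string>
#include <unordered_map>
#include <vector>

struct stmt_sketch
{
  std::string lhs;                  /* name defined by the statement */
  std::vector<std::string> ops;     /* names used by the statement */
};

/* Pass 1 pairs original and copied statements (same order, i.e. same
   UIDs) to build the LHS mapping; pass 2 rewrites operands through it.  */
static void
remap_epilogue_sketch (const std::vector<stmt_sketch> &orig,
                       std::vector<stmt_sketch> &copy)
{
  std::unordered_map<std::string, std::string> mapping;
  for (size_t i = 0; i < orig.size (); ++i)
    mapping.emplace (orig[i].lhs, copy[i].lhs);
  for (stmt_sketch &s : copy)
    for (std::string &op : s.ops)
      {
        auto it = mapping.find (op);
        if (it != mapping.end ())
          op = it->second;
      }
}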
8489 | |
8150 /* Function vect_transform_loop. | 8490 /* Function vect_transform_loop. |
8151 | 8491 |
8152 The analysis phase has determined that the loop is vectorizable. | 8492 The analysis phase has determined that the loop is vectorizable. |
8153 Vectorize the loop - create vectorized stmts to replace the scalar | 8493 Vectorize the loop - create vectorized stmts to replace the scalar |
8154 stmts in the loop, and update the loop exit condition. | 8494 stmts in the loop, and update the loop exit condition. |
8155 Returns scalar epilogue loop if any. */ | 8495 Returns scalar epilogue loop if any. */ |
8156 | 8496 |
8157 struct loop * | 8497 class loop * |
8158 vect_transform_loop (loop_vec_info loop_vinfo) | 8498 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) |
8159 { | 8499 { |
8160 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 8500 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
8161 struct loop *epilogue = NULL; | 8501 class loop *epilogue = NULL; |
8162 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | 8502 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
8163 int nbbs = loop->num_nodes; | 8503 int nbbs = loop->num_nodes; |
8164 int i; | 8504 int i; |
8165 tree niters_vector = NULL_TREE; | 8505 tree niters_vector = NULL_TREE; |
8166 tree step_vector = NULL_TREE; | 8506 tree step_vector = NULL_TREE; |
8179 of iterations is constant assume the cost check has been performed | 8519 of iterations is constant assume the cost check has been performed |
8180 by our caller. If the threshold makes all loops profitable that | 8520 by our caller. If the threshold makes all loops profitable that |
8181 run at least the (estimated) vectorization factor number of times | 8521 run at least the (estimated) vectorization factor number of times |
8182 checking is pointless, too. */ | 8522 checking is pointless, too. */ |
8183 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); | 8523 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); |
8184 if (th >= vect_vf_for_cost (loop_vinfo) | 8524 if (vect_apply_runtime_profitability_check_p (loop_vinfo)) |
8185 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) | |
8186 { | 8525 { |
8187 if (dump_enabled_p ()) | 8526 if (dump_enabled_p ()) |
8188 dump_printf_loc (MSG_NOTE, vect_location, | 8527 dump_printf_loc (MSG_NOTE, vect_location, |
8189 "Profitability threshold is %d loop iterations.\n", | 8528 "Profitability threshold is %d loop iterations.\n", |
8190 th); | 8529 th); |
8191 check_profitability = true; | 8530 check_profitability = true; |
8192 } | 8531 } |
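
When check_profitability survives to peeling/versioning, it is materialized as a runtime guard that sends iteration counts below the threshold TH to the scalar loop. Roughly the shape of the emitted decision, as a sketch rather than the literal GIMPLE (function pointers stand in for the two loop bodies):

/* Run the vector loop only when enough iterations remain to amortize
   the vectorization overhead; otherwise keep the scalar loop.  */
static void
profitability_guard_sketch (unsigned niters, unsigned th,
                            void (*vector_loop) (unsigned),
                            void (*scalar_loop) (unsigned))
{
  if (niters >= th)
    vector_loop (niters);
  else
    scalar_loop (niters);
}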
8193 | 8532 |
8194 /* Make sure there exists a single-predecessor exit bb. Do this before | 8533 /* Make sure there exists a single-predecessor exit bb. Do this before |
8195 versioning. */ | 8534 versioning. */ |
8196 edge e = single_exit (loop); | 8535 edge e = single_exit (loop); |
8197 if (! single_pred_p (e->dest)) | 8536 if (! single_pred_p (e->dest)) |
8198 { | 8537 { |
8199 split_loop_exit_edge (e); | 8538 split_loop_exit_edge (e, true); |
8200 if (dump_enabled_p ()) | 8539 if (dump_enabled_p ()) |
8201 dump_printf (MSG_NOTE, "split exit edge\n"); | 8540 dump_printf (MSG_NOTE, "split exit edge\n"); |
8202 } | 8541 } |
8203 | 8542 |
8204 /* Version the loop first, if required, so the profitability check | 8543 /* Version the loop first, if required, so the profitability check |
8205 comes first. */ | 8544 comes first. */ |
8206 | 8545 |
8207 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) | 8546 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) |
8208 { | 8547 { |
8209 poly_uint64 versioning_threshold | 8548 class loop *sloop |
8210 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); | 8549 = vect_loop_versioning (loop_vinfo, loop_vectorized_call); |
8211 if (check_profitability | 8550 sloop->force_vectorize = false; |
8212 && ordered_p (poly_uint64 (th), versioning_threshold)) | |
8213 { | |
8214 versioning_threshold = ordered_max (poly_uint64 (th), | |
8215 versioning_threshold); | |
8216 check_profitability = false; | |
8217 } | |
8218 vect_loop_versioning (loop_vinfo, th, check_profitability, | |
8219 versioning_threshold); | |
8220 check_profitability = false; | 8551 check_profitability = false; |
8221 } | 8552 } |
8222 | 8553 |
8223 /* Make sure there exists a single-predecessor exit bb also on the | 8554 /* Make sure there exists a single-predecessor exit bb also on the |
8224 scalar loop copy. Do this after versioning but before peeling | 8555 scalar loop copy. Do this after versioning but before peeling |
8228 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) | 8559 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) |
8229 { | 8560 { |
8230 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)); | 8561 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)); |
8231 if (! single_pred_p (e->dest)) | 8562 if (! single_pred_p (e->dest)) |
8232 { | 8563 { |
8233 split_loop_exit_edge (e); | 8564 split_loop_exit_edge (e, true); |
8234 if (dump_enabled_p ()) | 8565 if (dump_enabled_p ()) |
8235 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n"); | 8566 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n"); |
8236 } | 8567 } |
8237 } | 8568 } |
8238 | 8569 |
8239 tree niters = vect_build_loop_niters (loop_vinfo); | 8570 tree niters = vect_build_loop_niters (loop_vinfo); |
8240 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; | 8571 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; |
8241 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); | 8572 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); |
8242 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); | 8573 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); |
8574 tree advance; | |
8575 drs_init_vec orig_drs_init; | |
8576 | |
8243 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, | 8577 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, |
8244 &step_vector, &niters_vector_mult_vf, th, | 8578 &step_vector, &niters_vector_mult_vf, th, |
8245 check_profitability, niters_no_overflow); | 8579 check_profitability, niters_no_overflow, |
8580 &advance); | |
8581 | |
8582 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo) | |
8583 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ()) | |
8584 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo), | |
8585 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo)); | |
8246 | 8586 |
8247 if (niters_vector == NULL_TREE) | 8587 if (niters_vector == NULL_TREE) |
8248 { | 8588 { |
8249 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | 8589 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
8250 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) | 8590 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) |
8314 && dump_enabled_p ()) | 8654 && dump_enabled_p ()) |
8315 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); | 8655 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); |
8316 | 8656 |
8317 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def | 8657 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def |
8318 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def | 8658 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
8319 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) | 8659 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def |
8660 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle | |
8661 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def) | |
8320 && ! PURE_SLP_STMT (stmt_info)) | 8662 && ! PURE_SLP_STMT (stmt_info)) |
8321 { | 8663 { |
8322 if (dump_enabled_p ()) | 8664 if (dump_enabled_p ()) |
8323 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n"); | 8665 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n"); |
8324 vect_transform_stmt (stmt_info, NULL, NULL, NULL); | 8666 vect_transform_stmt (stmt_info, NULL, NULL, NULL); |
8466 dump_printf_loc (MSG_NOTE, vect_location, | 8808 dump_printf_loc (MSG_NOTE, vect_location, |
8467 "OUTER LOOP VECTORIZED\n"); | 8809 "OUTER LOOP VECTORIZED\n"); |
8468 dump_printf (MSG_NOTE, "\n"); | 8810 dump_printf (MSG_NOTE, "\n"); |
8469 } | 8811 } |
8470 else | 8812 else |
8471 { | 8813 dump_printf_loc (MSG_NOTE, vect_location, |
8472 dump_printf_loc (MSG_NOTE, vect_location, | 8814 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n", |
8473 "LOOP EPILOGUE VECTORIZED (VS="); | 8815 GET_MODE_NAME (loop_vinfo->vector_mode)); |
8474 dump_dec (MSG_NOTE, current_vector_size); | 8816 } |
8475 dump_printf (MSG_NOTE, ")\n"); | 8817 |
8476 } | 8818 /* Loops vectorized with a variable factor won't benefit from |
8477 } | 8819 unrolling/peeling. */ |
8478 | 8820 if (!vf.is_constant ()) |
8821 { | |
8822 loop->unroll = 1; | |
8823 if (dump_enabled_p ()) | |
8824 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to" | |
8825 " variable-length vectorization factor\n"); | |
8826 } | |
8479 /* Free SLP instances here because otherwise stmt reference counting | 8827 /* Free SLP instances here because otherwise stmt reference counting |
8480 won't work. */ | 8828 won't work. */ |
8481 slp_instance instance; | 8829 slp_instance instance; |
8482 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) | 8830 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) |
8483 vect_free_slp_instance (instance, true); | 8831 vect_free_slp_instance (instance, true); |
8484 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); | 8832 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); |
8485 /* Clear the safelen field since its value is invalid after vectorization: | 8833 /* Clear the safelen field since its value is invalid after vectorization: |
8486 the vectorized loop can have loop-carried dependencies. */ | 8834 the vectorized loop can have loop-carried dependencies. */ |
8487 loop->safelen = 0; | 8835 loop->safelen = 0; |
8488 | 8836 |
8489 /* Don't vectorize epilogue for epilogue. */ | |
8490 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) | |
8491 epilogue = NULL; | |
8492 | |
8493 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK)) | |
8494 epilogue = NULL; | |
8495 | |
8496 if (epilogue) | 8837 if (epilogue) |
8497 { | 8838 { |
8498 auto_vector_sizes vector_sizes; | 8839 update_epilogue_loop_vinfo (epilogue, advance); |
8499 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); | 8840 |
8500 unsigned int next_size = 0; | 8841 epilogue->simduid = loop->simduid; |
8501 | |
8502 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
8503 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0 | |
8504 && known_eq (vf, lowest_vf)) | |
8505 { | |
8506 unsigned int eiters | |
8507 = (LOOP_VINFO_INT_NITERS (loop_vinfo) | |
8508 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)); | |
8509 eiters = eiters % lowest_vf; | |
8510 epilogue->nb_iterations_upper_bound = eiters - 1; | |
8511 | |
8512 unsigned int ratio; | |
8513 while (next_size < vector_sizes.length () | |
8514 && !(constant_multiple_p (current_vector_size, | |
8515 vector_sizes[next_size], &ratio) | |
8516 && eiters >= lowest_vf / ratio)) | |
8517 next_size += 1; | |
8518 } | |
8519 else | |
8520 while (next_size < vector_sizes.length () | |
8521 && maybe_lt (current_vector_size, vector_sizes[next_size])) | |
8522 next_size += 1; | |
8523 | |
8524 if (next_size == vector_sizes.length ()) | |
8525 epilogue = NULL; | |
8526 } | |
8527 | |
8528 if (epilogue) | |
8529 { | |
8530 epilogue->force_vectorize = loop->force_vectorize; | 8842 epilogue->force_vectorize = loop->force_vectorize; |
8531 epilogue->safelen = loop->safelen; | |
8532 epilogue->dont_vectorize = false; | 8843 epilogue->dont_vectorize = false; |
8533 | |
8534 /* We may need to if-convert epilogue to vectorize it. */ | |
8535 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) | |
8536 tree_if_conversion (epilogue); | |
8537 } | 8844 } |
8538 | 8845 |
8539 return epilogue; | 8846 return epilogue; |
8540 } | 8847 } |
8541 | 8848 |
8561 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184); | 8868 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184); |
8562 } | 8869 } |
8563 */ | 8870 */ |
8564 | 8871 |
8565 void | 8872 void |
8566 optimize_mask_stores (struct loop *loop) | 8873 optimize_mask_stores (class loop *loop) |
8567 { | 8874 { |
8568 basic_block *bbs = get_loop_body (loop); | 8875 basic_block *bbs = get_loop_body (loop); |
8569 unsigned nbbs = loop->num_nodes; | 8876 unsigned nbbs = loop->num_nodes; |
8570 unsigned i; | 8877 unsigned i; |
8571 basic_block bb; | 8878 basic_block bb; |
8572 struct loop *bb_loop; | 8879 class loop *bb_loop; |
8573 gimple_stmt_iterator gsi; | 8880 gimple_stmt_iterator gsi; |
8574 gimple *stmt; | 8881 gimple *stmt; |
8575 auto_vec<gimple *> worklist; | 8882 auto_vec<gimple *> worklist; |
8883 auto_purge_vect_location sentinel; | |
8576 | 8884 |
8577 vect_location = find_loop_location (loop); | 8885 vect_location = find_loop_location (loop); |
8578 /* Pick up all masked stores in loop if any. */ | 8886 /* Pick up all masked stores in loop if any. */ |
8579 for (i = 0; i < nbbs; i++) | 8887 for (i = 0; i < nbbs; i++) |
8580 { | 8888 { |
8745 last = worklist.pop (); | 9053 last = worklist.pop (); |
8746 } | 9054 } |
8747 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION); | 9055 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION); |
8748 } | 9056 } |
8749 } | 9057 } |
9058 | |
9059 /* Decide whether it is possible to use a zero-based induction variable | |
9060 when vectorizing LOOP_VINFO with a fully-masked loop. If it is, | |
9061 return the value that the induction variable must be able to hold | |
9062 in order to ensure that the loop ends with an all-false mask. | |
9063 Return -1 otherwise. */ | |
9064 widest_int | |
9065 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo) | |
9066 { | |
9067 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); | |
9068 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
9069 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo); | |
9070 | |
9071 /* Calculate the value that the induction variable must be able | |
9072 to hit in order to ensure that we end the loop with an all-false mask. | |
9073 This involves adding the maximum number of inactive trailing scalar | |
9074 iterations. */ | |
9075 widest_int iv_limit = -1; | |
9076 if (max_loop_iterations (loop, &iv_limit)) | |
9077 { | |
9078 if (niters_skip) | |
9079 { | |
9080 /* Add the maximum number of skipped iterations to the | |
9081 maximum iteration count. */ | |
9082 if (TREE_CODE (niters_skip) == INTEGER_CST) | |
9083 iv_limit += wi::to_widest (niters_skip); | |
9084 else | |
9085 iv_limit += max_vf - 1; | |
9086 } | |
9087 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)) | |
9088 /* Make a conservatively-correct assumption. */ | |
9089 iv_limit += max_vf - 1; | |
9090 | |
9091 /* IV_LIMIT is the maximum number of latch iterations, which is also | |
9092 the maximum in-range IV value. Round this value down to the previous | |
9093 vector alignment boundary and then add an extra full iteration. */ | |
9094 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
9095 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf; | |
9096 } | |
9097 return iv_limit; | |
9098 } | |
9099 |