comparison gcc/tree-vect-loop.c @ 145:1830386684a0

gcc-9.2.0
author anatofuz
date Thu, 13 Feb 2020 11:34:05 +0900
parents 84e7813d76e9
children
comparing 131:84e7813d76e9 with 145:1830386684a0
1 /* Loop Vectorization 1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc. 2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com> 4 Ira Rosen <irar@il.ibm.com>
5 5
6 This file is part of GCC. 6 This file is part of GCC.
7 7
41 #include "tree-ssa-loop-ivopts.h" 41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h" 42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h" 43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h" 44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h" 45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h" 46 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h" 47 #include "tree-vectorizer.h"
49 #include "gimple-fold.h" 48 #include "gimple-fold.h"
50 #include "cgraph.h" 49 #include "cgraph.h"
51 #include "tree-cfg.h" 50 #include "tree-cfg.h"
152 For additional information on this project see: 151 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html 152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */ 153 */
155 154
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *); 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
157 158
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one 159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE 160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */ 161 may already be set for general statements (not just data refs). */
161 162
162 static opt_result 163 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info, 164 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p, 165 bool vectype_maybe_set_p,
165 poly_uint64 *vf, 166 poly_uint64 *vf)
166 vec<stmt_vec_info > *mask_producers)
167 { 167 {
168 gimple *stmt = stmt_info->stmt; 168 gimple *stmt = stmt_info->stmt;
169 169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info) 170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info)) 171 && !STMT_VINFO_LIVE_P (stmt_info))
189 that contain a data ref, or for "pattern-stmts" (stmts generated 189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */ 190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info) 191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p) 192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype); 193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else 194 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype; 195 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 } 196 }
199 197
200 if (nunits_vectype) 198 if (nunits_vectype)
203 return opt_result::success (); 201 return opt_result::success ();
204 } 202 }
205 203
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector 204 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update 205 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements 206 the vectorization factor VF accordingly. Return true on success
209 produce a mask result whose vector type can only be calculated later, 207 or false if something prevented vectorization. */
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
212 208
213 static opt_result 209 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf, 210 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
215 vec<stmt_vec_info > *mask_producers)
216 { 211 {
217 vec_info *vinfo = stmt_info->vinfo; 212 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ()) 213 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G", 214 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt); 215 stmt_info->stmt);
221 opt_result res 216 opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf);
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res) 217 if (!res)
224 return res; 218 return res;
225 219
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info) 220 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info)) 221 && STMT_VINFO_RELATED_STMT (stmt_info))
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si)); 230 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ()) 231 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location, 232 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G", 233 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt); 234 def_stmt_info->stmt);
241 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true, 235 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf);
242 vf, mask_producers))
243 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers);
245 if (!res) 236 if (!res)
246 return res; 237 return res;
247 } 238 }
248 239
249 if (dump_enabled_p ()) 240 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location, 241 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G", 242 "==> examining pattern statement: %G",
252 stmt_info->stmt); 243 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers); 244 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf);
254 if (!res) 245 if (!res)
255 return res; 246 return res;
256 } 247 }
257 248
258 return opt_result::success (); 249 return opt_result::success ();
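As an illustration of the pattern-def-stmt walk above (hypothetical user code, not part of this change), a widening multiply is a typical case where the vector type must be set on the pattern statement and its pattern def stmts rather than only on the original scalar statement:

void
widen_mult (int *c, short *a, short *b, int n)
{
  for (int i = 0; i < n; i++)
    /* Recognized by the pattern matcher as a widening multiply; the
       vector type is determined for the pattern stmt (and its pattern
       def stmts), not just for the original scalar statement.  */
    c[i] = (int) a[i] * (int) b[i];
}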
284 */ 275 */
285 276
286 static opt_result 277 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo) 278 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 { 279 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 280 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 281 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes; 282 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1; 283 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE; 284 tree scalar_type = NULL_TREE;
294 gphi *phi; 285 gphi *phi;
295 tree vectype; 286 tree vectype;
296 stmt_vec_info stmt_info; 287 stmt_vec_info stmt_info;
297 unsigned i; 288 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299 289
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor"); 290 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301 291
302 for (i = 0; i < nbbs; i++) 292 for (i = 0; i < nbbs; i++)
303 { 293 {
323 if (dump_enabled_p ()) 313 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location, 314 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n", 315 "get vectype for scalar type: %T\n",
326 scalar_type); 316 scalar_type);
327 317
328 vectype = get_vectype_for_scalar_type (scalar_type); 318 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
329 if (!vectype) 319 if (!vectype)
330 return opt_result::failure_at (phi, 320 return opt_result::failure_at (phi,
331 "not vectorized: unsupported " 321 "not vectorized: unsupported "
332 "data-type %T\n", 322 "data-type %T\n",
333 scalar_type); 323 scalar_type);
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 341 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si)) 342 gsi_next (&si))
353 { 343 {
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 344 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res 345 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor, 346 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor);
357 &mask_producers);
358 if (!res) 347 if (!res)
359 return res; 348 return res;
360 } 349 }
361 } 350 }
362 351
370 359
371 if (known_le (vectorization_factor, 1U)) 360 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location, 361 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n"); 362 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; 363 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375
376 for (i = 0; i < mask_producers.length (); i++)
377 {
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
383 }
384
385 return opt_result::success (); 364 return opt_result::success ();
386 } 365 }
387 366
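A made-up illustration of how the factor is accumulated above: with 128-bit vectors, a loop that narrows ints to shorts needs an 8-lane vector type for the shorts and a 4-lane type for the ints, so the loop-wide vectorization factor becomes 8 and the int operations are handled as two vectors per vector iteration:

void
narrowing_copy (short *dst, int *src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = (short) src[i];   /* VF = 8 on a 128-bit vector target */
}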
388 367
389 /* Function vect_is_simple_iv_evolution. 368 /* Function vect_is_simple_iv_evolution.
479 in LOOP. LOOP_VINFO represents the loop that is now being 458 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop 459 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */ 460 enclosing LOOP). */
482 461
483 static void 462 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) 463 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
485 { 464 {
486 basic_block bb = loop->header; 465 basic_block bb = loop->header;
487 tree init, step; 466 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist; 467 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi; 468 gphi_iterator gsi;
490 bool double_reduc; 469 bool double_reduc, reduc_chain;
491 470
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles"); 471 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
493 472
494 /* First - identify all inductions. Reduction detection assumes that all the 473 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be 474 inductions have been identified, therefore, this order must not be
557 536
558 gcc_assert (!virtual_operand_p (def) 537 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); 538 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
560 539
561 stmt_vec_info reduc_stmt_info 540 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo, 541 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
563 &double_reduc, false); 542 &reduc_chain);
564 if (reduc_stmt_info) 543 if (reduc_stmt_info)
565 { 544 {
566 if (double_reduc) 545 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
567 { 546 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
568 if (dump_enabled_p ()) 547 if (double_reduc)
569 dump_printf_loc (MSG_NOTE, vect_location, 548 {
549 if (dump_enabled_p ())
550 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n"); 551 "Detected double reduction.\n");
571 552
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; 553 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info) 554 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
574 = vect_double_reduction_def;
575 } 555 }
576 else 556 else
577 { 557 {
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo)) 558 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
579 { 559 {
580 if (dump_enabled_p ()) 560 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location, 561 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n"); 562 "Detected vectorizable nested cycle.\n");
583 563
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle; 564 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
586 } 565 }
587 else 566 else
588 { 567 {
589 if (dump_enabled_p ()) 568 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location, 569 dump_printf_loc (MSG_NOTE, vect_location,
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def; 572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def; 573 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in 574 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction 575 loop-aware SLP if it was not detected as reduction
597 chain. */ 576 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info)) 577 if (! reduc_chain)
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push 578 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info); 579 (reduc_stmt_info);
601 } 580 }
602 } 581 }
603 } 582 }
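Illustrative scalar cycles of the kinds classified above (not taken from this change): a plain reduction, and a double reduction where an inner-loop reduction feeds an outer-loop PHI:

int
plain_reduction (int *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)   /* i: induction */
    sum += a[i];                /* sum: vect_reduction_def */
  return sum;
}

int
double_reduction (int (*b)[100], int n, int m)
{
  int acc = 0;
  for (int i = 0; i < n; i++)       /* outer loop */
    for (int j = 0; j < m; j++)     /* inner loop */
      acc += b[i][j];               /* acc's outer-loop PHI becomes
                                       vect_double_reduction_def when
                                       the outer loop is vectorized */
  return acc;
}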
631 a[i] = i; */ 610 a[i] = i; */
632 611
633 static void 612 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo) 613 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635 { 614 {
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 615 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637 616
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop); 617 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639 618
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially. 619 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than 620 Reductions in such inner-loop therefore have different properties than
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)); 641 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info); 642 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 do 643 do
665 { 644 {
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info); 645 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
646 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
647 == STMT_VINFO_DEF_TYPE (stmt_info));
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp; 648 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info); 649 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info) 650 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp) 651 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info); 652 = STMT_VINFO_RELATED_STMT (stmt_info);
672 } 653 }
673 while (stmt_info); 654 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 } 655 }
676 656
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */ 657 /* Fixup scalar cycles that now have their stmts detected as patterns. */
678 658
679 static void 659 static void
686 if (STMT_VINFO_IN_PATTERN_P (first)) 666 if (STMT_VINFO_IN_PATTERN_P (first))
687 { 667 {
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); 668 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next) 669 while (next)
690 { 670 {
691 if (! STMT_VINFO_IN_PATTERN_P (next)) 671 if (! STMT_VINFO_IN_PATTERN_P (next)
672 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
692 break; 673 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next); 674 next = REDUC_GROUP_NEXT_ELEMENT (next);
694 } 675 }
695 /* If not all stmt in the chain are patterns try to handle 676 /* If not all stmt in the chain are patterns or if we failed
696 the chain without patterns. */ 677 to update STMT_VINFO_REDUC_IDX try to handle the chain
697 if (! next) 678 without patterns. */
679 if (! next
680 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
698 { 681 {
699 vect_fixup_reduc_chain (first); 682 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] 683 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first); 684 = STMT_VINFO_RELATED_STMT (first);
702 } 685 }
712 695
713 Return the loop exit condition. */ 696 Return the loop exit condition. */
714 697
715 698
716 static gcond * 699 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions, 700 vect_get_loop_niters (class loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1) 701 tree *number_of_iterations, tree *number_of_iterationsm1)
719 { 702 {
720 edge exit = single_exit (loop); 703 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc; 704 class tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero; 705 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop); 706 gcond *cond = get_loop_exit_condition (loop);
724 707
725 *assumptions = boolean_true_node; 708 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know; 709 *number_of_iterationsm1 = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters"); 711 DUMP_VECT_SCOPE ("get_loop_niters");
729 712
730 if (!exit) 713 if (!exit)
731 return cond; 714 return cond;
732 715
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE; 716 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) 717 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter)) 718 || chrec_contains_undetermined (niter_desc.niter))
738 return cond; 719 return cond;
739 720
740 niter_assumptions = niter_desc.assumptions; 721 niter_assumptions = niter_desc.assumptions;
793 Used as predicate for dfs order traversal of the loop bbs. */ 774 Used as predicate for dfs order traversal of the loop bbs. */
794 775
795 static bool 776 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data) 777 bb_in_loop_p (const_basic_block bb, const void *data)
797 { 778 {
798 const struct loop *const loop = (const struct loop *)data; 779 const class loop *const loop = (const class loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb)) 780 if (flow_bb_inside_loop_p (loop, bb))
800 return true; 781 return true;
801 return false; 782 return false;
802 } 783 }
803 784
804 785
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as 786 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */ 787 stmt_vec_info structs for all the stmts in LOOP_IN. */
807 788
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared) 789 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared), 790 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in), 791 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)), 792 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE), 793 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE), 794 num_iters (NULL_TREE),
817 versioning_threshold (0), 798 versioning_threshold (0),
818 vectorization_factor (0), 799 vectorization_factor (0),
819 max_vectorization_factor (0), 800 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE), 801 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE), 802 mask_compare_type (NULL_TREE),
803 simd_if_cond (NULL_TREE),
822 unaligned_dr (NULL), 804 unaligned_dr (NULL),
823 peeling_for_alignment (0), 805 peeling_for_alignment (0),
824 ptr_mask (0), 806 ptr_mask (0),
825 ivexpr_map (NULL), 807 ivexpr_map (NULL),
808 scan_map (NULL),
826 slp_unrolling_factor (1), 809 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0), 810 single_scalar_iteration_cost (0),
811 vec_outside_cost (0),
812 vec_inside_cost (0),
828 vectorizable (false), 813 vectorizable (false),
829 can_fully_mask_p (true), 814 can_fully_mask_p (true),
830 fully_masked_p (false), 815 fully_masked_p (false),
831 peeling_for_gaps (false), 816 peeling_for_gaps (false),
832 peeling_for_niter (false), 817 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false), 818 no_data_dependencies (false),
835 has_mask_store (false), 819 has_mask_store (false),
820 scalar_loop_scaling (profile_probability::uninitialized ()),
836 scalar_loop (NULL), 821 scalar_loop (NULL),
837 orig_loop_info (NULL) 822 orig_loop_info (NULL)
838 { 823 {
839 /* CHECKME: We want to visit all BBs before their successors (except for 824 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple 825 latch blocks, for which this assertion wouldn't hold). In the simple
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 845 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
861 { 846 {
862 gimple *stmt = gsi_stmt (si); 847 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0); 848 gimple_set_uid (stmt, 0);
864 add_stmt (stmt); 849 add_stmt (stmt);
865 } 850 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
866 } 851 third argument is the #pragma omp simd if (x) condition, when 0,
852 loop shouldn't be vectorized, when non-zero constant, it should
853 be vectorized normally, otherwise versioned with vectorized loop
854 done if the condition is non-zero at runtime. */
855 if (loop_in->simduid
856 && is_gimple_call (stmt)
857 && gimple_call_internal_p (stmt)
858 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
859 && gimple_call_num_args (stmt) >= 3
860 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
861 && (loop_in->simduid
862 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
863 {
864 tree arg = gimple_call_arg (stmt, 2);
865 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
866 simd_if_cond = arg;
867 else
868 gcc_assert (integer_nonzerop (arg));
869 }
870 }
871 }
872
873 epilogue_vinfos.create (6);
867 } 874 }
868 875
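A hypothetical source loop of the kind the new simd_if_cond handling describes: the if() clause becomes the third argument of the .GOMP_SIMD_LANE call, so the loop is rejected for a constant 0, vectorized normally for a non-zero constant, and versioned on the runtime value otherwise:

void
simd_if_loop (int *a, int *b, int n, int use_simd)
{
#pragma omp simd if (use_simd)
  for (int i = 0; i < n; i++)
    a[i] += b[i];
}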
869 /* Free all levels of MASKS. */ 876 /* Free all levels of MASKS. */
870 877
871 void 878 void
881 /* Free all memory used by the _loop_vec_info, as well as all the 888 /* Free all memory used by the _loop_vec_info, as well as all the
882 stmt_vec_info structs of all the stmts in the loop. */ 889 stmt_vec_info structs of all the stmts in the loop. */
883 890
884 _loop_vec_info::~_loop_vec_info () 891 _loop_vec_info::~_loop_vec_info ()
885 { 892 {
886 int nbbs;
887 gimple_stmt_iterator si;
888 int j;
889
890 nbbs = loop->num_nodes;
891 for (j = 0; j < nbbs; j++)
892 {
893 basic_block bb = bbs[j];
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
895 {
896 gimple *stmt = gsi_stmt (si);
897
898 /* We may have broken canonical form by moving a constant
899 into RHS1 of a commutative op. Fix such occurrences. */
900 if (operands_swapped && is_gimple_assign (stmt))
901 {
902 enum tree_code code = gimple_assign_rhs_code (stmt);
903
904 if ((code == PLUS_EXPR
905 || code == POINTER_PLUS_EXPR
906 || code == MULT_EXPR)
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
908 swap_ssa_operands (stmt,
909 gimple_assign_rhs1_ptr (stmt),
910 gimple_assign_rhs2_ptr (stmt));
911 else if (code == COND_EXPR
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
913 {
914 tree cond_expr = gimple_assign_rhs1 (stmt);
915 enum tree_code cond_code = TREE_CODE (cond_expr);
916
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
918 {
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
920 0));
921 cond_code = invert_tree_comparison (cond_code,
922 honor_nans);
923 if (cond_code != ERROR_MARK)
924 {
925 TREE_SET_CODE (cond_expr, cond_code);
926 swap_ssa_operands (stmt,
927 gimple_assign_rhs2_ptr (stmt),
928 gimple_assign_rhs3_ptr (stmt));
929 }
930 }
931 }
932 }
933 gsi_next (&si);
934 }
935 }
936
937 free (bbs); 893 free (bbs);
938 894
939 release_vec_loop_masks (&masks); 895 release_vec_loop_masks (&masks);
940 delete ivexpr_map; 896 delete ivexpr_map;
897 delete scan_map;
898 epilogue_vinfos.release ();
941 899
942 loop->aux = NULL; 900 loop->aux = NULL;
943 } 901 }
944 902
945 /* Return an invariant or register for EXPR and emit necessary 903 /* Return an invariant or register for EXPR and emit necessary
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */ 963 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1006 964
1007 static bool 965 static bool
1008 vect_verify_full_masking (loop_vec_info loop_vinfo) 966 vect_verify_full_masking (loop_vec_info loop_vinfo)
1009 { 967 {
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 968 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1011 unsigned int min_ni_width; 969 unsigned int min_ni_width;
970 unsigned int max_nscalars_per_iter
971 = vect_get_max_nscalars_per_iter (loop_vinfo);
1012 972
1013 /* Use a normal loop if there are no statements that need masking. 973 /* Use a normal loop if there are no statements that need masking.
1014 This only happens in rare degenerate cases: it means that the loop 974 This only happens in rare degenerate cases: it means that the loop
1015 has no loads, no stores, and no live-out values. */ 975 has no loads, no stores, and no live-out values. */
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) 976 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1025 widest_int max_back_edges; 985 widest_int max_back_edges;
1026 if (max_loop_iterations (loop, &max_back_edges)) 986 if (max_loop_iterations (loop, &max_back_edges))
1027 max_ni = wi::smin (max_ni, max_back_edges + 1); 987 max_ni = wi::smin (max_ni, max_back_edges + 1);
1028 988
1029 /* Account for rgroup masks, in which each bit is replicated N times. */ 989 /* Account for rgroup masks, in which each bit is replicated N times. */
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo); 990 max_ni *= max_nscalars_per_iter;
1031 991
1032 /* Work out how many bits we need to represent the limit. */ 992 /* Work out how many bits we need to represent the limit. */
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED); 993 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1034 994
1035 /* Find a scalar mode for which WHILE_ULT is supported. */ 995 /* Find a scalar mode for which WHILE_ULT is supported. */
1036 opt_scalar_int_mode cmp_mode_iter; 996 opt_scalar_int_mode cmp_mode_iter;
1037 tree cmp_type = NULL_TREE; 997 tree cmp_type = NULL_TREE;
998 tree iv_type = NULL_TREE;
999 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1000 unsigned int iv_precision = UINT_MAX;
1001
1002 if (iv_limit != -1)
1003 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1004 UNSIGNED);
1005
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) 1006 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1039 { 1007 {
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); 1008 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1041 if (cmp_bits >= min_ni_width 1009 if (cmp_bits >= min_ni_width
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) 1010 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true); 1012 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1045 if (this_type 1013 if (this_type
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type)) 1014 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1047 { 1015 {
1048 /* Although we could stop as soon as we find a valid mode, 1016 /* Although we could stop as soon as we find a valid mode,
1049 it's often better to continue until we hit Pmode, since the 1017 there are at least two reasons why that's not always the
1050 operands to the WHILE are more likely to be reusable in 1018 best choice:
1051 address calculations. */ 1019
1052 cmp_type = this_type; 1020 - An IV that's Pmode or wider is more likely to be reusable
1021 in address calculations than an IV that's narrower than
1022 Pmode.
1023
1024 - Doing the comparison in IV_PRECISION or wider allows
1025 a natural 0-based IV, whereas using a narrower comparison
1026 type requires mitigations against wrap-around.
1027
1028 Conversely, if the IV limit is variable, doing the comparison
1029 in a wider type than the original type can introduce
1030 unnecessary extensions, so picking the widest valid mode
1031 is not always a good choice either.
1032
1033 Here we prefer the first IV type that's Pmode or wider,
1034 and the first comparison type that's IV_PRECISION or wider.
1035 (The comparison type must be no wider than the IV type,
1036 to avoid extensions in the vector loop.)
1037
1038 ??? We might want to try continuing beyond Pmode for ILP32
1039 targets if CMP_BITS < IV_PRECISION. */
1040 iv_type = this_type;
1041 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1042 cmp_type = this_type;
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode)) 1043 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1054 break; 1044 break;
1055 } 1045 }
1056 } 1046 }
1057 } 1047 }
1058 1048
1059 if (!cmp_type) 1049 if (!cmp_type)
1060 return false; 1050 return false;
1061 1051
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type; 1052 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1053 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1063 return true; 1054 return true;
1064 } 1055 }
1065 1056
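A worked example of the width computation above, with made-up numbers: if the loop executes at most 1000 iterations and the widest rgroup has max_nscalars_per_iter == 4, then max_ni = 4000 and min_ni_width = wi::min_precision (4000, UNSIGNED) = 12 bits. The first integer mode of at least 12 bits whose loop masks the target can produce becomes a candidate comparison type, while the IV type search still prefers the first mode that is at least as wide as Pmode.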
1066 /* Calculate the cost of one scalar iteration of the loop. */ 1057 /* Calculate the cost of one scalar iteration of the loop. */
1067 static void 1058 static void
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) 1059 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1069 { 1060 {
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1061 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1062 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1072 int nbbs = loop->num_nodes, factor; 1063 int nbbs = loop->num_nodes, factor;
1073 int innerloop_iters, i; 1064 int innerloop_iters, i;
1074 1065
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost"); 1066 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1098 1089
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt)) 1090 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1100 continue; 1091 continue;
1101 1092
1102 /* Skip stmts that are not vectorized inside the loop. */ 1093 /* Skip stmts that are not vectorized inside the loop. */
1103 if (stmt_info 1094 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1104 && !STMT_VINFO_RELEVANT_P (stmt_info) 1095 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1105 && (!STMT_VINFO_LIVE_P (stmt_info) 1096 && (!STMT_VINFO_LIVE_P (vstmt_info)
1106 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) 1097 || !VECTORIZABLE_CYCLE_DEF
1107 && !STMT_VINFO_IN_PATTERN_P (stmt_info)) 1098 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1108 continue; 1099 continue;
1109 1100
1110 vect_cost_for_stmt kind; 1101 vect_cost_for_stmt kind;
1111 if (STMT_VINFO_DATA_REF (stmt_info)) 1102 if (STMT_VINFO_DATA_REF (stmt_info))
1112 { 1103 {
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) 1104 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1114 kind = scalar_load; 1105 kind = scalar_load;
1115 else 1106 else
1116 kind = scalar_store; 1107 kind = scalar_store;
1117 } 1108 }
1118 else 1109 else if (vect_nop_conversion_p (stmt_info))
1110 continue;
1111 else
1119 kind = scalar_stmt; 1112 kind = scalar_stmt;
1120 1113
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1114 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1122 factor, kind, stmt_info, 0, vect_prologue); 1115 factor, kind, stmt_info, 0, vect_prologue);
1123 } 1116 }
1147 - the loop exit condition is simple enough 1140 - the loop exit condition is simple enough
1148 - the number of iterations can be analyzed, i.e, a countable loop. The 1141 - the number of iterations can be analyzed, i.e, a countable loop. The
1149 niter could be analyzed under some assumptions. */ 1142 niter could be analyzed under some assumptions. */
1150 1143
1151 opt_result 1144 opt_result
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond, 1145 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1153 tree *assumptions, tree *number_of_iterationsm1, 1146 tree *assumptions, tree *number_of_iterationsm1,
1154 tree *number_of_iterations, gcond **inner_loop_cond) 1147 tree *number_of_iterations, gcond **inner_loop_cond)
1155 { 1148 {
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form"); 1149 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1157 1150
1182 return opt_result::failure_at (vect_location, 1175 return opt_result::failure_at (vect_location,
1183 "not vectorized: empty loop.\n"); 1176 "not vectorized: empty loop.\n");
1184 } 1177 }
1185 else 1178 else
1186 { 1179 {
1187 struct loop *innerloop = loop->inner; 1180 class loop *innerloop = loop->inner;
1188 edge entryedge; 1181 edge entryedge;
1189 1182
1190 /* Nested loop. We currently require that the loop is doubly-nested, 1183 /* Nested loop. We currently require that the loop is doubly-nested,
1191 contains a single inner loop, and the number of BBs is exactly 5. 1184 contains a single inner loop, and the number of BBs is exactly 5.
1192 Vectorizable outer-loops look like this: 1185 Vectorizable outer-loops look like this:
1299 } 1292 }
1300 1293
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */ 1294 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1302 1295
1303 opt_loop_vec_info 1296 opt_loop_vec_info
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared) 1297 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1305 { 1298 {
1306 tree assumptions, number_of_iterations, number_of_iterationsm1; 1299 tree assumptions, number_of_iterations, number_of_iterationsm1;
1307 gcond *loop_cond, *inner_loop_cond = NULL; 1300 gcond *loop_cond, *inner_loop_cond = NULL;
1308 1301
1309 opt_result res 1302 opt_result res
1362 statements update the vectorization factor. */ 1355 statements update the vectorization factor. */
1363 1356
1364 static void 1357 static void
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo) 1358 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1366 { 1359 {
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1360 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1361 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1369 int nbbs = loop->num_nodes; 1362 int nbbs = loop->num_nodes;
1370 poly_uint64 vectorization_factor; 1363 poly_uint64 vectorization_factor;
1371 int i; 1364 int i;
1372 1365
1382 exploited. */ 1375 exploited. */
1383 bool only_slp_in_loop = true; 1376 bool only_slp_in_loop = true;
1384 for (i = 0; i < nbbs; i++) 1377 for (i = 0; i < nbbs; i++)
1385 { 1378 {
1386 basic_block bb = bbs[i]; 1379 basic_block bb = bbs[i];
1380 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1381 gsi_next (&si))
1382 {
1383 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1384 if (!stmt_info)
1385 continue;
1386 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1387 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1388 && !PURE_SLP_STMT (stmt_info))
1389 /* STMT needs both SLP and loop-based vectorization. */
1390 only_slp_in_loop = false;
1391 }
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 1392 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1388 gsi_next (&si)) 1393 gsi_next (&si))
1389 { 1394 {
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 1395 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1391 stmt_info = vect_stmt_to_vectorize (stmt_info); 1396 stmt_info = vect_stmt_to_vectorize (stmt_info);
1397 } 1402 }
1398 } 1403 }
1399 1404
1400 if (only_slp_in_loop) 1405 if (only_slp_in_loop)
1401 { 1406 {
1402 dump_printf_loc (MSG_NOTE, vect_location, 1407 if (dump_enabled_p ())
1403 "Loop contains only SLP stmts\n"); 1408 dump_printf_loc (MSG_NOTE, vect_location,
1409 "Loop contains only SLP stmts\n");
1404 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); 1410 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1405 } 1411 }
1406 else 1412 else
1407 { 1413 {
1408 dump_printf_loc (MSG_NOTE, vect_location, 1414 if (dump_enabled_p ())
1409 "Loop contains SLP and non-SLP stmts\n"); 1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "Loop contains SLP and non-SLP stmts\n");
1410 /* Both the vectorization factor and unroll factor have the form 1417 /* Both the vectorization factor and unroll factor have the form
1411 current_vector_size * X for some rational X, so they must have 1418 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1412 a common multiple. */ 1419 so they must have a common multiple. */
1413 vectorization_factor 1420 vectorization_factor
1414 = force_common_multiple (vectorization_factor, 1421 = force_common_multiple (vectorization_factor,
1415 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); 1422 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1416 } 1423 }
1417 1424
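Illustrative only: force_common_multiple behaves like a least common multiple here, so a non-SLP vectorization factor of 4 combined with an SLP unrolling factor of 2 stays at 4, while combining it with an unrolling factor of 3 raises the loop's vectorization factor to 12.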
1456 Scan the loop stmts and make sure they are all vectorizable. */ 1463 Scan the loop stmts and make sure they are all vectorizable. */
1457 1464
1458 static opt_result 1465 static opt_result
1459 vect_analyze_loop_operations (loop_vec_info loop_vinfo) 1466 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1460 { 1467 {
1461 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1468 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1462 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1469 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1463 int nbbs = loop->num_nodes; 1470 int nbbs = loop->num_nodes;
1464 int i; 1471 int i;
1465 stmt_vec_info stmt_info; 1472 stmt_vec_info stmt_info;
1466 bool need_to_vectorize = false; 1473 bool need_to_vectorize = false;
1467 bool ok; 1474 bool ok;
1468 1475
1469 DUMP_VECT_SCOPE ("vect_analyze_loop_operations"); 1476 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1470 1477
1471 stmt_vector_for_cost cost_vec; 1478 auto_vec<stmt_info_for_cost> cost_vec;
1472 cost_vec.create (2);
1473 1479
1474 for (i = 0; i < nbbs; i++) 1480 for (i = 0; i < nbbs; i++)
1475 { 1481 {
1476 basic_block bb = bbs[i]; 1482 basic_block bb = bbs[i];
1477 1483
1511 return opt_result::failure_at (phi, "unsupported phi"); 1517 return opt_result::failure_at (phi, "unsupported phi");
1512 1518
1513 phi_op = PHI_ARG_DEF (phi, 0); 1519 phi_op = PHI_ARG_DEF (phi, 0);
1514 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op); 1520 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1515 if (!op_def_info) 1521 if (!op_def_info)
1516 return opt_result::failure_at (phi, "unsupported phi"); 1522 return opt_result::failure_at (phi, "unsupported phi\n");
1517 1523
1518 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer 1524 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1519 && (STMT_VINFO_RELEVANT (op_def_info) 1525 && (STMT_VINFO_RELEVANT (op_def_info)
1520 != vect_used_in_outer_by_reduction)) 1526 != vect_used_in_outer_by_reduction))
1521 return opt_result::failure_at (phi, "unsupported phi"); 1527 return opt_result::failure_at (phi, "unsupported phi\n");
1528
1529 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1530 || (STMT_VINFO_DEF_TYPE (stmt_info)
1531 == vect_double_reduction_def))
1532 && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1533 return opt_result::failure_at (phi, "unsupported phi\n");
1522 } 1534 }
1523 1535
1524 continue; 1536 continue;
1525 } 1537 }
1526 1538
1540 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def 1552 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1541 && ! PURE_SLP_STMT (stmt_info)) 1553 && ! PURE_SLP_STMT (stmt_info))
1542 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL, 1554 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1543 &cost_vec); 1555 &cost_vec);
1544 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 1556 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1557 || (STMT_VINFO_DEF_TYPE (stmt_info)
1558 == vect_double_reduction_def)
1545 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 1559 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1546 && ! PURE_SLP_STMT (stmt_info)) 1560 && ! PURE_SLP_STMT (stmt_info))
1547 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL, 1561 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1548 &cost_vec);
1549 } 1562 }
1550 1563
1551 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */ 1564 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1552 if (ok 1565 if (ok
1553 && STMT_VINFO_LIVE_P (stmt_info) 1566 && STMT_VINFO_LIVE_P (stmt_info)
1554 && !PURE_SLP_STMT (stmt_info)) 1567 && !PURE_SLP_STMT (stmt_info))
1555 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL, 1568 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1556 &cost_vec); 1569 -1, false, &cost_vec);
1557 1570
1558 if (!ok) 1571 if (!ok)
1559 return opt_result::failure_at (phi, 1572 return opt_result::failure_at (phi,
1560 "not vectorized: relevant phi not " 1573 "not vectorized: relevant phi not "
1561 "supported: %G", 1574 "supported: %G",
1577 } 1590 }
1578 } 1591 }
1579 } /* bbs */ 1592 } /* bbs */
1580 1593
1581 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec); 1594 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1582 cost_vec.release ();
1583 1595
1584 /* All operations in the loop are either irrelevant (deal with loop 1596 /* All operations in the loop are either irrelevant (deal with loop
1585 control, or dead), or only used outside the loop and can be moved 1597 control, or dead), or only used outside the loop and can be moved
1586 out of the loop (e.g. invariants, inductions). The loop can be 1598 out of the loop (e.g. invariants, inductions). The loop can be
1587 optimized away by scalar optimizations. We're better off not 1599 optimized away by scalar optimizations. We're better off not
1604 definitely no, or -1 if it's worth retrying. */ 1616 definitely no, or -1 if it's worth retrying. */
1605 1617
1606 static int 1618 static int
1607 vect_analyze_loop_costing (loop_vec_info loop_vinfo) 1619 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1608 { 1620 {
1609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1621 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1610 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 1622 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1611 1623
1612 /* Only fully-masked loops can have iteration counts less than the 1624 /* Only fully-masked loops can have iteration counts less than the
1613 vectorization factor. */ 1625 vectorization factor. */
1614 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 1626 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1645 "not vectorized: vector version will never be " 1657 "not vectorized: vector version will never be "
1646 "profitable.\n"); 1658 "profitable.\n");
1647 return -1; 1659 return -1;
1648 } 1660 }
1649 1661
1650 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) 1662 int min_scalar_loop_bound = (param_min_vect_loop_bound
1651 * assumed_vf); 1663 * assumed_vf);
1652 1664
1653 /* Use the cost model only if it is more conservative than user specified 1665 /* Use the cost model only if it is more conservative than user specified
1654 threshold. */ 1666 threshold. */
1655 unsigned int th = (unsigned) MAX (min_scalar_loop_bound, 1667 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1669 "specified loop bound parameter or minimum profitable " 1681 "specified loop bound parameter or minimum profitable "
1670 "iterations (whichever is more conservative).\n"); 1682 "iterations (whichever is more conservative).\n");
1671 return 0; 1683 return 0;
1672 } 1684 }
1673 1685
1674 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop); 1686 /* The static profitablity threshold min_profitable_estimate includes
1675 if (estimated_niter == -1) 1687 the cost of having to check at runtime whether the scalar loop
1676 estimated_niter = likely_max_stmt_executions_int (loop); 1688 should be used instead. If it turns out that we don't need or want
1689 such a check, the threshold we should use for the static estimate
1690 is simply the point at which the vector loop becomes more profitable
1691 than the scalar loop. */
1692 if (min_profitable_estimate > min_profitable_iters
1693 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1694 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1695 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1696 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1697 {
1698 if (dump_enabled_p ())
1699 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1700 " choice between the scalar and vector loops\n");
1701 min_profitable_estimate = min_profitable_iters;
1702 }
1703
1704 HOST_WIDE_INT estimated_niter;
1705
1706 /* If we are vectorizing an epilogue then we know the maximum number of
1707 scalar iterations it will cover is at least one lower than the
1708 vectorization factor of the main loop. */
1709 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1710 estimated_niter
1711 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1712 else
1713 {
1714 estimated_niter = estimated_stmt_executions_int (loop);
1715 if (estimated_niter == -1)
1716 estimated_niter = likely_max_stmt_executions_int (loop);
1717 }
1677 if (estimated_niter != -1 1718 if (estimated_niter != -1
1678 && ((unsigned HOST_WIDE_INT) estimated_niter 1719 && ((unsigned HOST_WIDE_INT) estimated_niter
1679 < MAX (th, (unsigned) min_profitable_estimate))) 1720 < MAX (th, (unsigned) min_profitable_estimate)))
1680 { 1721 {
1681 if (dump_enabled_p ()) 1722 if (dump_enabled_p ())
1744 return res; 1785 return res;
1745 } 1786 }
1746 /* If dependence analysis will give up due to the limit on the 1787 /* If dependence analysis will give up due to the limit on the
1747 number of datarefs stop here and fail fatally. */ 1788 number of datarefs stop here and fail fatally. */
1748 if (datarefs->length () 1789 if (datarefs->length ()
1749 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS)) 1790 > (unsigned)param_loop_max_datarefs_for_datadeps)
1750 return opt_result::failure_at (stmt, "exceeded param " 1791 return opt_result::failure_at (stmt, "exceeded param "
1751 "loop-max-datarefs-for-datadeps\n"); 1792 "loop-max-datarefs-for-datadeps\n");
1752 } 1793 }
1753 return opt_result::success (); 1794 return opt_result::success ();
1754 } 1795 }
1755 1796
1756 /* Function vect_analyze_loop_2. 1797 /* Look for SLP-only access groups and turn each individual access into its own
1757 1798 group. */
1758 Apply a set of analyses on LOOP, and create a loop_vec_info struct 1799 static void
1759 for it. The different analyses will record information in the 1800 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1760 loop_vec_info struct. */
1761 static opt_result
1762 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1763 { 1801 {
1764 opt_result ok = opt_result::success (); 1802 unsigned int i;
1765 int res; 1803 struct data_reference *dr;
1766 unsigned int max_vf = MAX_VECTORIZATION_FACTOR; 1804
1767 poly_uint64 min_vf = 2; 1805 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1768 1806
1769 /* The first group of checks is independent of the vector size. */ 1807 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1770 fatal = true; 1808 FOR_EACH_VEC_ELT (datarefs, i, dr)
1771 1809 {
1772 /* Find all data references in the loop (which correspond to vdefs/vuses) 1810 gcc_assert (DR_REF (dr));
1773 and analyze their evolution in the loop. */ 1811 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1774 1812
1775 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo); 1813 /* Check if the load is a part of an interleaving chain. */
1776 1814 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1777 /* Gather the data references and count stmts in the loop. */ 1815 {
1778 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ()) 1816 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1779 { 1817 unsigned int group_size = DR_GROUP_SIZE (first_element);
1780 opt_result res 1818
1781 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo), 1819 /* Check if SLP-only groups. */
1782 &LOOP_VINFO_DATAREFS (loop_vinfo), 1820 if (!STMT_SLP_TYPE (stmt_info)
1783 n_stmts); 1821 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1784 if (!res) 1822 {
1785 { 1823 /* Dissolve the group. */
1786 if (dump_enabled_p ()) 1824 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1825
1788 "not vectorized: loop contains function " 1826 stmt_vec_info vinfo = first_element;
1789 "calls or data references that cannot " 1827 while (vinfo)
1790 "be analyzed\n"); 1828 {
1791 return res; 1829 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1792 } 1830 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1793 loop_vinfo->shared->save_datarefs (); 1831 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1794 } 1832 DR_GROUP_SIZE (vinfo) = 1;
1795 else 1833 if (STMT_VINFO_STRIDED_P (first_element))
1796 loop_vinfo->shared->check_datarefs (); 1834 DR_GROUP_GAP (vinfo) = 0;
1797 1835 else
1798 /* Analyze the data references and also adjust the minimal 1836 DR_GROUP_GAP (vinfo) = group_size - 1;
1799 vectorization factor according to the loads and stores. */ 1837 vinfo = next;
1800 1838 }
1801 ok = vect_analyze_data_refs (loop_vinfo, &min_vf); 1839 }
1802 if (!ok) 1840 }
1803 { 1841 }
1804 if (dump_enabled_p ()) 1842 }
1805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1843
1806 "bad data references.\n"); 1844
1807 return ok; 1845 /* Decides whether we need to create an epilogue loop to handle
1808 } 1846 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1809 1847
1810 /* Classify all cross-iteration scalar data-flow cycles. 1848 void
1811 Cross-iteration cycles caused by virtual phis are analyzed separately. */ 1849 determine_peel_for_niter (loop_vec_info loop_vinfo)
1812 vect_analyze_scalar_cycles (loop_vinfo); 1850 {
1813 1851 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1814 vect_pattern_recog (loop_vinfo); 1852
1815 1853 unsigned HOST_WIDE_INT const_vf;
1816 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1817
1818 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1819 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1820
1821 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1822 if (!ok)
1823 {
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "bad data access.\n");
1827 return ok;
1828 }
1829
1830 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1831
1832 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1833 if (!ok)
1834 {
1835 if (dump_enabled_p ())
1836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 "unexpected pattern.\n");
1838 return ok;
1839 }
1840
1841 /* While the rest of the analysis below depends on it in some way. */
1842 fatal = false;
1843
1844 /* Analyze data dependences between the data-refs in the loop
1845 and adjust the maximum vectorization factor according to
1846 the dependences.
1847 FORNOW: fail at the first data dependence that we encounter. */
1848
1849 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1850 if (!ok)
1851 {
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "bad data dependence.\n");
1855 return ok;
1856 }
1857 if (max_vf != MAX_VECTORIZATION_FACTOR
1858 && maybe_lt (max_vf, min_vf))
1859 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1860 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1861
1862 ok = vect_determine_vectorization_factor (loop_vinfo);
1863 if (!ok)
1864 {
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "can't determine vectorization factor.\n");
1868 return ok;
1869 }
1870 if (max_vf != MAX_VECTORIZATION_FACTOR
1871 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1872 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1873
1874 /* Compute the scalar iteration cost. */
1875 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1876
1877 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1878 unsigned th;
1879
1880 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1881 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1882 if (!ok)
1883 return ok;
1884
1885 /* If there are any SLP instances mark them as pure_slp. */
1886 bool slp = vect_make_slp_decision (loop_vinfo);
1887 if (slp)
1888 {
1889 /* Find stmts that need to be both vectorized and SLPed. */
1890 vect_detect_hybrid_slp (loop_vinfo);
1891
1892 /* Update the vectorization factor based on the SLP decision. */
1893 vect_update_vf_for_slp (loop_vinfo);
1894 }
1895
1896 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1897
1898 /* We don't expect to have to roll back to anything other than an empty
1899 set of rgroups. */
1900 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1901
1902 /* This is the point where we can re-start analysis with SLP forced off. */
1903 start_over:
1904
1905 /* Now the vectorization factor is final. */
1906 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1907 gcc_assert (known_ne (vectorization_factor, 0U));
1908
1909 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1910 {
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "vectorization_factor = ");
1913 dump_dec (MSG_NOTE, vectorization_factor);
1914 dump_printf (MSG_NOTE, ", niters = %wd\n",
1915 LOOP_VINFO_INT_NITERS (loop_vinfo));
1916 }
1917
1918 HOST_WIDE_INT max_niter 1854 HOST_WIDE_INT max_niter
1919 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); 1855 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1920 1856
1921 /* Analyze the alignment of the data-refs in the loop. 1857 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1922 Fail if a data reference is found that cannot be vectorized. */ 1858 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1923 1859 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1924 ok = vect_analyze_data_refs_alignment (loop_vinfo); 1860 (loop_vinfo));
1925 if (!ok) 1861
1926 {
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad data alignment.\n");
1930 return ok;
1931 }
1932
1933 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1934 It is important to call pruning after vect_analyze_data_ref_accesses,
1935 since we use grouping information gathered by interleaving analysis. */
1936 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1937 if (!ok)
1938 return ok;
1939
1940 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1941 vectorization, since we do not want to add extra peeling or
1942 add versioning for alignment. */
1943 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1944 /* This pass will decide on using loop versioning and/or loop peeling in
1945 order to enhance the alignment of data references in the loop. */
1946 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1947 else
1948 ok = vect_verify_datarefs_alignment (loop_vinfo);
1949 if (!ok)
1950 return ok;
1951
1952 if (slp)
1953 {
1954 /* Analyze operations in the SLP instances. Note this may
1955 remove unsupported SLP instances which makes the above
1956 SLP kind detection invalid. */
1957 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1958 vect_slp_analyze_operations (loop_vinfo);
1959 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1960 {
1961 ok = opt_result::failure_at (vect_location,
1962 "unsupported SLP instances\n");
1963 goto again;
1964 }
1965 }
1966
1967 /* Scan all the remaining operations in the loop that are not subject
1968 to SLP and make sure they are vectorizable. */
1969 ok = vect_analyze_loop_operations (loop_vinfo);
1970 if (!ok)
1971 {
1972 if (dump_enabled_p ())
1973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1974 "bad operation or unsupported loop bound.\n");
1975 return ok;
1976 }
1977
1978 /* Decide whether to use a fully-masked loop for this vectorization
1979 factor. */
1980 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
1981 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
1982 && vect_verify_full_masking (loop_vinfo));
1983 if (dump_enabled_p ())
1984 {
1985 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1986 dump_printf_loc (MSG_NOTE, vect_location,
1987 "using a fully-masked loop.\n");
1988 else
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "not using a fully-masked loop.\n");
1991 }
1992
1993 /* If an epilogue loop is required because of data accesses with gaps,
1994 one additional iteration needs to be peeled. Check if there are
1995 enough iterations for vectorization. */
1996 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1997 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1998 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1999 {
2000 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2001 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2002
2003 if (known_lt (wi::to_widest (scalar_niters), vf))
2004 return opt_result::failure_at (vect_location,
2005 "loop has no enough iterations to"
2006 " support peeling for gaps.\n");
2007 }
2008
2009 /* Check the costings of the loop make vectorizing worthwhile. */
2010 res = vect_analyze_loop_costing (loop_vinfo);
2011 if (res < 0)
2012 {
2013 ok = opt_result::failure_at (vect_location,
2014 "Loop costings may not be worthwhile.\n");
2015 goto again;
2016 }
2017 if (!res)
2018 return opt_result::failure_at (vect_location,
2019 "Loop costings not worthwhile.\n");
2020
2021 /* Decide whether we need to create an epilogue loop to handle
2022 remaining scalar iterations. */
2023 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2024
2025 unsigned HOST_WIDE_INT const_vf;
2026 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 1862 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2027 /* The main loop handles all iterations. */ 1863 /* The main loop handles all iterations. */
2028 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; 1864 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2029 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 1865 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2030 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) 1866 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2051 the epilogue is unnecessary. */ 1887 the epilogue is unnecessary. */
2052 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) 1888 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2053 || ((unsigned HOST_WIDE_INT) max_niter 1889 || ((unsigned HOST_WIDE_INT) max_niter
2054 > (th / const_vf) * const_vf)))) 1890 > (th / const_vf) * const_vf))))
2055 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; 1891 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2056 1892 }
1893
1894
1895 /* Function vect_analyze_loop_2.
1896
1897 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1898 for it. The different analyses will record information in the
1899 loop_vec_info struct. */
1900 static opt_result
1901 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1902 {
1903 opt_result ok = opt_result::success ();
1904 int res;
1905 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1906 poly_uint64 min_vf = 2;
1907 loop_vec_info orig_loop_vinfo = NULL;
1908
1909 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1910 loop_vec_info of the first vectorized loop. */
1911 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1912 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1913 else
1914 orig_loop_vinfo = loop_vinfo;
1915 gcc_assert (orig_loop_vinfo);
1916
1917 /* The first group of checks is independent of the vector size. */
1918 fatal = true;
1919
1920 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1921 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1922 return opt_result::failure_at (vect_location,
1923 "not vectorized: simd if(0)\n");
1924
1925 /* Find all data references in the loop (which correspond to vdefs/vuses)
1926 and analyze their evolution in the loop. */
1927
1928 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1929
1930 /* Gather the data references and count stmts in the loop. */
1931 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1932 {
1933 opt_result res
1934 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1935 &LOOP_VINFO_DATAREFS (loop_vinfo),
1936 n_stmts);
1937 if (!res)
1938 {
1939 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 "not vectorized: loop contains function "
1942 "calls or data references that cannot "
1943 "be analyzed\n");
1944 return res;
1945 }
1946 loop_vinfo->shared->save_datarefs ();
1947 }
1948 else
1949 loop_vinfo->shared->check_datarefs ();
1950
1951 /* Analyze the data references and also adjust the minimal
1952 vectorization factor according to the loads and stores. */
1953
1954 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1955 if (!ok)
1956 {
1957 if (dump_enabled_p ())
1958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1959 "bad data references.\n");
1960 return ok;
1961 }
1962
1963 /* Classify all cross-iteration scalar data-flow cycles.
1964 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1965 vect_analyze_scalar_cycles (loop_vinfo);
1966
1967 vect_pattern_recog (loop_vinfo);
1968
1969 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1970
1971 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1972 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1973
1974 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1975 if (!ok)
1976 {
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1979 "bad data access.\n");
1980 return ok;
1981 }
1982
1983 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1984
1985 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1986 if (!ok)
1987 {
1988 if (dump_enabled_p ())
1989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1990 "unexpected pattern.\n");
1991 return ok;
1992 }
1993
1994 /* The rest of the analysis below depends on the chosen vector size in some way, so from here on failures are not fatal. */
1995 fatal = false;
1996
1997 /* Analyze data dependences between the data-refs in the loop
1998 and adjust the maximum vectorization factor according to
1999 the dependences.
2000 FORNOW: fail at the first data dependence that we encounter. */
2001
2002 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2003 if (!ok)
2004 {
2005 if (dump_enabled_p ())
2006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2007 "bad data dependence.\n");
2008 return ok;
2009 }
2010 if (max_vf != MAX_VECTORIZATION_FACTOR
2011 && maybe_lt (max_vf, min_vf))
2012 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2013 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2014
2015 ok = vect_determine_vectorization_factor (loop_vinfo);
2016 if (!ok)
2017 {
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 "can't determine vectorization factor.\n");
2021 return ok;
2022 }
2023 if (max_vf != MAX_VECTORIZATION_FACTOR
2024 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2025 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2026
2027 /* Compute the scalar iteration cost. */
2028 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2029
2030 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2031
2032 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2033 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2034 if (!ok)
2035 return ok;
2036
2037 /* If there are any SLP instances mark them as pure_slp. */
2038 bool slp = vect_make_slp_decision (loop_vinfo);
2039 if (slp)
2040 {
2041 /* Find stmts that need to be both vectorized and SLPed. */
2042 vect_detect_hybrid_slp (loop_vinfo);
2043
2044 /* Update the vectorization factor based on the SLP decision. */
2045 vect_update_vf_for_slp (loop_vinfo);
2046 }
2047
2048 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2049
2050 /* We don't expect to have to roll back to anything other than an empty
2051 set of rgroups. */
2052 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2053
2054 /* This is the point where we can re-start analysis with SLP forced off. */
2055 start_over:
2056
2057 /* Now the vectorization factor is final. */
2058 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2059 gcc_assert (known_ne (vectorization_factor, 0U));
2060
2061 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2062 {
2063 dump_printf_loc (MSG_NOTE, vect_location,
2064 "vectorization_factor = ");
2065 dump_dec (MSG_NOTE, vectorization_factor);
2066 dump_printf (MSG_NOTE, ", niters = %wd\n",
2067 LOOP_VINFO_INT_NITERS (loop_vinfo));
2068 }
2069
2070 /* Analyze the alignment of the data-refs in the loop.
2071 Fail if a data reference is found that cannot be vectorized. */
2072
2073 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2074 if (!ok)
2075 {
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2078 "bad data alignment.\n");
2079 return ok;
2080 }
2081
2082 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2083 It is important to call pruning after vect_analyze_data_ref_accesses,
2084 since we use grouping information gathered by interleaving analysis. */
2085 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2086 if (!ok)
2087 return ok;
2088
2089 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2090 vectorization, since we do not want to add extra peeling or
2091 add versioning for alignment. */
2092 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2093 /* This pass will decide on using loop versioning and/or loop peeling in
2094 order to enhance the alignment of data references in the loop. */
2095 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2096 else
2097 ok = vect_verify_datarefs_alignment (loop_vinfo);
2098 if (!ok)
2099 return ok;
2100
2101 if (slp)
2102 {
2103 /* Analyze operations in the SLP instances. Note this may
2104 remove unsupported SLP instances which makes the above
2105 SLP kind detection invalid. */
2106 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2107 vect_slp_analyze_operations (loop_vinfo);
2108 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2109 {
2110 ok = opt_result::failure_at (vect_location,
2111 "unsupported SLP instances\n");
2112 goto again;
2113 }
2114 }
2115
2116 /* Dissolve SLP-only groups. */
2117 vect_dissolve_slp_only_groups (loop_vinfo);
2118
2119 /* Scan all the remaining operations in the loop that are not subject
2120 to SLP and make sure they are vectorizable. */
2121 ok = vect_analyze_loop_operations (loop_vinfo);
2122 if (!ok)
2123 {
2124 if (dump_enabled_p ())
2125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2126 "bad operation or unsupported loop bound.\n");
2127 return ok;
2128 }
2129
2130 /* Decide whether to use a fully-masked loop for this vectorization
2131 factor. */
2132 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2133 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2134 && vect_verify_full_masking (loop_vinfo));
2135 if (dump_enabled_p ())
2136 {
2137 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2138 dump_printf_loc (MSG_NOTE, vect_location,
2139 "using a fully-masked loop.\n");
2140 else
2141 dump_printf_loc (MSG_NOTE, vect_location,
2142 "not using a fully-masked loop.\n");
2143 }
2144
2145 /* If an epilogue loop is required because of data accesses with gaps,
2146 one additional iteration needs to be peeled. Check if there are
2147 enough iterations for vectorization. */
2148 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2149 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2150 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2151 {
2152 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2153 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2154
2155 if (known_lt (wi::to_widest (scalar_niters), vf))
2156 return opt_result::failure_at (vect_location,
2157 "loop has no enough iterations to"
2158 " support peeling for gaps.\n");
2159 }
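The check just above compares LOOP_VINFO_NITERSM1 (the iteration count minus one) with the vectorization factor. A rough, hypothetical illustration, not taken from the GCC sources: with VF = 4 and peeling for gaps required, a loop with exactly 4 scalar iterations has NITERSM1 = 3, known_lt (3, 4) holds, and analysis fails unless the loop can be fully masked.

/* Hypothetical scalar source; whether the strided load below really sets
   LOOP_VINFO_PEELING_FOR_GAPS depends on how the target vectorizes it.  */
void
gap_example (int *restrict out, const int *restrict in)
{
  for (int i = 0; i < 4; i++)   /* 4 iterations: NITERSM1 == 3 < VF == 4 */
    out[i] = in[2 * i];         /* element 2*i + 1 of each pair is a gap */
}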
2160
2161 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2162 loop or a loop that has a lower VF than the main loop. */
2163 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2164 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2165 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2166 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2167 return opt_result::failure_at (vect_location,
2168 "Vectorization factor too high for"
2169 " epilogue loop.\n");
2170
2171 /* Check the costings of the loop make vectorizing worthwhile. */
2172 res = vect_analyze_loop_costing (loop_vinfo);
2173 if (res < 0)
2174 {
2175 ok = opt_result::failure_at (vect_location,
2176 "Loop costings may not be worthwhile.\n");
2177 goto again;
2178 }
2179 if (!res)
2180 return opt_result::failure_at (vect_location,
2181 "Loop costings not worthwhile.\n");
2182
2183 determine_peel_for_niter (loop_vinfo);
2057 /* If an epilogue loop is required make sure we can create one. */ 2184 /* If an epilogue loop is required make sure we can create one. */
2058 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2185 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2059 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) 2186 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2060 { 2187 {
2061 if (dump_enabled_p ()) 2188 if (dump_enabled_p ())
2073 } 2200 }
2074 2201
2075 /* During peeling, we need to check if number of loop iterations is 2202 /* During peeling, we need to check if number of loop iterations is
2076 enough for both peeled prolog loop and vector loop. This check 2203 enough for both peeled prolog loop and vector loop. This check
2077 can be merged along with threshold check of loop versioning, so 2204 can be merged along with threshold check of loop versioning, so
2078 increase threshold for this case if necessary. */ 2205 increase threshold for this case if necessary.
2079 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 2206
2207 If we are analyzing an epilogue we still want to check what its
2208 versioning threshold would be. If we decide to vectorize the epilogues we
2209 will want to use the lowest versioning threshold of all epilogues and main
2210 loop. This will enable us to enter a vectorized epilogue even when
2211 versioning the loop. We can't simply check whether the epilogue requires
2212 versioning though since we may have skipped some versioning checks when
2213 analyzing the epilogue. For instance, checks for alias versioning will be
2214 skipped when dealing with epilogues as we assume we already checked them
2215 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2216 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2080 { 2217 {
2081 poly_uint64 niters_th = 0; 2218 poly_uint64 niters_th = 0;
2219 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2082 2220
2083 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) 2221 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2084 { 2222 {
2085 /* Niters for peeled prolog loop. */ 2223 /* Niters for peeled prolog loop. */
2086 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 2224 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2097 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2235 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2098 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2236 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2099 /* One additional iteration because of peeling for gap. */ 2237 /* One additional iteration because of peeling for gap. */
2100 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2238 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2101 niters_th += 1; 2239 niters_th += 1;
2240
2241 /* Use the same condition as vect_transform_loop to decide when to use
2242 the cost to determine a versioning threshold. */
2243 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2244 && ordered_p (th, niters_th))
2245 niters_th = ordered_max (poly_uint64 (th), niters_th);
2246
2102 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; 2247 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2103 } 2248 }
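A minimal standalone sketch of the accumulation above, using plain integers instead of poly_uint64 and ignoring the fully-masked and alignment-mask cases; the numbers in the trailing comment are hypothetical.

static unsigned
sketch_versioning_threshold (unsigned vf, unsigned prologue_peel,
			     bool peeling_for_gaps, unsigned cost_model_th)
{
  /* Prologue peel iterations (up to VF - 1 when the misalignment is
     unknown), plus VF for at least one full vector iteration, plus one
     extra iteration when peeling for gaps.  */
  unsigned niters_th = prologue_peel + vf + (peeling_for_gaps ? 1 : 0);
  /* Fold in the runtime cost-model threshold, as done above via
     ordered_max.  */
  return niters_th > cost_model_th ? niters_th : cost_model_th;
}

/* E.g. sketch_versioning_threshold (4, 3, true, 10) == 10: the cost-model
   threshold dominates the 3 + 4 + 1 = 8 iterations needed structurally.  */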
2104 2249
2105 gcc_assert (known_eq (vectorization_factor, 2250 gcc_assert (known_eq (vectorization_factor,
2106 LOOP_VINFO_VECT_FACTOR (loop_vinfo))); 2251 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2174 for (gimple_stmt_iterator si = gsi_start_phis (bb); 2319 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2175 !gsi_end_p (si); gsi_next (&si)) 2320 !gsi_end_p (si); gsi_next (&si))
2176 { 2321 {
2177 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 2322 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2178 STMT_SLP_TYPE (stmt_info) = loop_vect; 2323 STMT_SLP_TYPE (stmt_info) = loop_vect;
2324 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2325 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2326 {
2327 /* vectorizable_reduction adjusts reduction stmt def-types,
2328 restore them to that of the PHI. */
2329 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2330 = STMT_VINFO_DEF_TYPE (stmt_info);
2331 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2332 (STMT_VINFO_REDUC_DEF (stmt_info)))
2333 = STMT_VINFO_DEF_TYPE (stmt_info);
2334 }
2179 } 2335 }
2180 for (gimple_stmt_iterator si = gsi_start_bb (bb); 2336 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2181 !gsi_end_p (si); gsi_next (&si)) 2337 !gsi_end_p (si); gsi_next (&si))
2182 { 2338 {
2183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 2339 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2212 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p; 2368 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2213 2369
2214 goto start_over; 2370 goto start_over;
2215 } 2371 }
2216 2372
2373 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2374 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2375 OLD_LOOP_VINFO is better unless something specifically indicates
2376 otherwise.
2377
2378 Note that this deliberately isn't a partial order. */
2379
2380 static bool
2381 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2382 loop_vec_info old_loop_vinfo)
2383 {
2384 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2385 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2386
2387 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2388 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2389
2390 /* Always prefer a VF of loop->simdlen over any other VF. */
2391 if (loop->simdlen)
2392 {
2393 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2394 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2395 if (new_simdlen_p != old_simdlen_p)
2396 return new_simdlen_p;
2397 }
2398
2399 /* Limit the VFs to what is likely to be the maximum number of iterations,
2400 to handle cases in which at least one loop_vinfo is fully-masked. */
2401 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2402 if (estimated_max_niter != -1)
2403 {
2404 if (known_le (estimated_max_niter, new_vf))
2405 new_vf = estimated_max_niter;
2406 if (known_le (estimated_max_niter, old_vf))
2407 old_vf = estimated_max_niter;
2408 }
2409
2410 /* Check whether the (fractional) cost per scalar iteration is lower
2411 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2412 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2413 * poly_widest_int (old_vf));
2414 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2415 * poly_widest_int (new_vf));
2416 if (maybe_lt (rel_old, rel_new))
2417 return false;
2418 if (known_lt (rel_new, rel_old))
2419 return true;
2420
2421 /* If there's nothing to choose between the loop bodies, see whether
2422 there's a difference in the prologue and epilogue costs. */
2423 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2424 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2425
2426 return false;
2427 }
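The cross-multiplication used above avoids dividing the body costs by (possibly non-constant) VFs. A self-contained sketch with hypothetical costs, not taken from any target cost model:

#include <stdbool.h>
#include <stdint.h>

/* Return true if new_inside_cost / new_vf < old_inside_cost / old_vf,
   evaluated without division, mirroring the comparison above.  */
static bool
cheaper_per_scalar_iter_p (uint64_t new_inside_cost, uint64_t new_vf,
			   uint64_t old_inside_cost, uint64_t old_vf)
{
  return new_inside_cost * old_vf < old_inside_cost * new_vf;
}

/* cheaper_per_scalar_iter_p (12, 4, 20, 8) is false: 12/4 = 3.0 per scalar
   iteration loses against 20/8 = 2.5, so the old loop_vec_info is kept.  */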
2428
2429 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2430 true if we should. */
2431
2432 static bool
2433 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2434 loop_vec_info old_loop_vinfo)
2435 {
2436 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2437 return false;
2438
2439 if (dump_enabled_p ())
2440 dump_printf_loc (MSG_NOTE, vect_location,
2441 "***** Preferring vector mode %s to vector mode %s\n",
2442 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2443 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2444 return true;
2445 }
2446
2217 /* Function vect_analyze_loop. 2447 /* Function vect_analyze_loop.
2218 2448
2219 Apply a set of analyses on LOOP, and create a loop_vec_info struct 2449 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2220 for it. The different analyses will record information in the 2450 for it. The different analyses will record information in the
2221 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must 2451 loop_vec_info struct. */
2222 be vectorized. */
2223 opt_loop_vec_info 2452 opt_loop_vec_info
2224 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo, 2453 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2225 vec_info_shared *shared)
2226 { 2454 {
2227 auto_vector_sizes vector_sizes; 2455 auto_vector_modes vector_modes;
2228 2456
2229 /* Autodetect first vector size we try. */ 2457 /* Autodetect first vector size we try. */
2230 current_vector_size = 0; 2458 unsigned int autovec_flags
2231 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); 2459 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2232 unsigned int next_size = 0; 2460 loop->simdlen != 0);
2461 unsigned int mode_i = 0;
2233 2462
2234 DUMP_VECT_SCOPE ("analyze_loop_nest"); 2463 DUMP_VECT_SCOPE ("analyze_loop_nest");
2235 2464
2236 if (loop_outer (loop) 2465 if (loop_outer (loop)
2237 && loop_vec_info_for_loop (loop_outer (loop)) 2466 && loop_vec_info_for_loop (loop_outer (loop))
2244 (vect_location, 2473 (vect_location,
2245 "not vectorized: loop nest containing two or more consecutive inner" 2474 "not vectorized: loop nest containing two or more consecutive inner"
2246 " loops cannot be vectorized\n"); 2475 " loops cannot be vectorized\n");
2247 2476
2248 unsigned n_stmts = 0; 2477 unsigned n_stmts = 0;
2249 poly_uint64 autodetected_vector_size = 0; 2478 machine_mode autodetected_vector_mode = VOIDmode;
2479 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2480 machine_mode next_vector_mode = VOIDmode;
2481 poly_uint64 lowest_th = 0;
2482 unsigned vectorized_loops = 0;
2483 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2484 && !unlimited_cost_model (loop));
2485
2486 bool vect_epilogues = false;
2487 opt_result res = opt_result::success ();
2488 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2250 while (1) 2489 while (1)
2251 { 2490 {
2252 /* Check the CFG characteristics of the loop (nesting, entry/exit). */ 2491 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2253 opt_loop_vec_info loop_vinfo 2492 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2254 = vect_analyze_loop_form (loop, shared);
2255 if (!loop_vinfo) 2493 if (!loop_vinfo)
2256 { 2494 {
2257 if (dump_enabled_p ()) 2495 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad loop form.\n"); 2497 "bad loop form.\n");
2498 gcc_checking_assert (first_loop_vinfo == NULL);
2260 return loop_vinfo; 2499 return loop_vinfo;
2261 } 2500 }
2501 loop_vinfo->vector_mode = next_vector_mode;
2262 2502
2263 bool fatal = false; 2503 bool fatal = false;
2264 2504
2265 if (orig_loop_vinfo) 2505 /* When pick_lowest_cost_p is true, we should in principle iterate
2266 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo; 2506 over all the loop_vec_infos that LOOP_VINFO could replace and
2267 2507 try to vectorize LOOP_VINFO under the same conditions.
2268 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts); 2508 E.g. when trying to replace an epilogue loop, we should vectorize
2509 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2510 to replace the main loop, we should vectorize LOOP_VINFO as a main
2511 loop too.
2512
2513 However, autovectorize_vector_modes is usually sorted as follows:
2514
2515 - Modes that naturally produce lower VFs usually follow modes that
2516 naturally produce higher VFs.
2517
2518 - When modes naturally produce the same VF, maskable modes
2519 usually follow unmaskable ones, so that the maskable mode
2520 can be used to vectorize the epilogue of the unmaskable mode.
2521
2522 This order is preferred because it leads to the maximum
2523 epilogue vectorization opportunities. Targets should only use
2524 a different order if they want to make wide modes available while
2525 disparaging them relative to earlier, smaller modes. The assumption
2526 in that case is that the wider modes are more expensive in some
2527 way that isn't reflected directly in the costs.
2528
2529 There should therefore be few interesting cases in which
2530 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2531 treated as a standalone loop, and ends up being genuinely cheaper
2532 than FIRST_LOOP_VINFO. */
2533 if (vect_epilogues)
2534 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2535
2536 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2537 if (mode_i == 0)
2538 autodetected_vector_mode = loop_vinfo->vector_mode;
2539 if (dump_enabled_p ())
2540 {
2541 if (res)
2542 dump_printf_loc (MSG_NOTE, vect_location,
2543 "***** Analysis succeeded with vector mode %s\n",
2544 GET_MODE_NAME (loop_vinfo->vector_mode));
2545 else
2546 dump_printf_loc (MSG_NOTE, vect_location,
2547 "***** Analysis failed with vector mode %s\n",
2548 GET_MODE_NAME (loop_vinfo->vector_mode));
2549 }
2550
2551 loop->aux = NULL;
2552
2553 if (!fatal)
2554 while (mode_i < vector_modes.length ()
2555 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2556 {
2557 if (dump_enabled_p ())
2558 dump_printf_loc (MSG_NOTE, vect_location,
2559 "***** The result for vector mode %s would"
2560 " be the same\n",
2561 GET_MODE_NAME (vector_modes[mode_i]));
2562 mode_i += 1;
2563 }
2564
2269 if (res) 2565 if (res)
2270 { 2566 {
2271 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; 2567 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2272 2568 vectorized_loops++;
2273 return loop_vinfo; 2569
2274 } 2570 /* Once we hit the desired simdlen for the first time,
2275 2571 discard any previous attempts. */
2276 delete loop_vinfo; 2572 if (simdlen
2277 2573 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2278 if (next_size == 0) 2574 {
2279 autodetected_vector_size = current_vector_size; 2575 delete first_loop_vinfo;
2280 2576 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2281 if (next_size < vector_sizes.length () 2577 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2282 && known_eq (vector_sizes[next_size], autodetected_vector_size)) 2578 simdlen = 0;
2283 next_size += 1; 2579 }
2284 2580 else if (pick_lowest_cost_p && first_loop_vinfo)
2285 if (fatal 2581 {
2286 || next_size == vector_sizes.length () 2582 /* Keep trying to roll back vectorization attempts while the
2287 || known_eq (current_vector_size, 0U)) 2583 loop_vec_infos they produced were worse than this one. */
2288 return opt_loop_vec_info::propagate_failure (res); 2584 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2585 while (!vinfos.is_empty ()
2586 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2587 {
2588 gcc_assert (vect_epilogues);
2589 delete vinfos.pop ();
2590 }
2591 if (vinfos.is_empty ()
2592 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2593 {
2594 delete first_loop_vinfo;
2595 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2596 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2597 }
2598 }
2599
2600 if (first_loop_vinfo == NULL)
2601 {
2602 first_loop_vinfo = loop_vinfo;
2603 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2604 }
2605 else if (vect_epilogues
2606 /* For now only allow one epilogue loop. */
2607 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2608 {
2609 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2610 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2611 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2612 || maybe_ne (lowest_th, 0U));
2613 /* Keep track of the known smallest versioning
2614 threshold. */
2615 if (ordered_p (lowest_th, th))
2616 lowest_th = ordered_min (lowest_th, th);
2617 }
2618 else
2619 delete loop_vinfo;
2620
2621 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2622 enabled, SIMDUID is not set, it is the innermost loop and we have
2623 either already found the loop's SIMDLEN or there was no SIMDLEN to
2624 begin with.
2625 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2626 vect_epilogues = (!simdlen
2627 && loop->inner == NULL
2628 && param_vect_epilogues_nomask
2629 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2630 && !loop->simduid
2631 /* For now only allow one epilogue loop, but allow
2632 pick_lowest_cost_p to replace it. */
2633 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2634 || pick_lowest_cost_p));
2635
2636 /* Commit to first_loop_vinfo if we have no reason to try
2637 alternatives. */
2638 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2639 break;
2640 }
2641 else
2642 {
2643 delete loop_vinfo;
2644 if (fatal)
2645 {
2646 gcc_checking_assert (first_loop_vinfo == NULL);
2647 break;
2648 }
2649 }
2650
2651 if (mode_i < vector_modes.length ()
2652 && VECTOR_MODE_P (autodetected_vector_mode)
2653 && (related_vector_mode (vector_modes[mode_i],
2654 GET_MODE_INNER (autodetected_vector_mode))
2655 == autodetected_vector_mode)
2656 && (related_vector_mode (autodetected_vector_mode,
2657 GET_MODE_INNER (vector_modes[mode_i]))
2658 == vector_modes[mode_i]))
2659 {
2660 if (dump_enabled_p ())
2661 dump_printf_loc (MSG_NOTE, vect_location,
2662 "***** Skipping vector mode %s, which would"
2663 " repeat the analysis for %s\n",
2664 GET_MODE_NAME (vector_modes[mode_i]),
2665 GET_MODE_NAME (autodetected_vector_mode));
2666 mode_i += 1;
2667 }
2668
2669 if (mode_i == vector_modes.length ()
2670 || autodetected_vector_mode == VOIDmode)
2671 break;
2289 2672
2290 /* Try the next biggest vector size. */ 2673 /* Try the next biggest vector size. */
2291 current_vector_size = vector_sizes[next_size++]; 2674 next_vector_mode = vector_modes[mode_i++];
2292 if (dump_enabled_p ()) 2675 if (dump_enabled_p ())
2293 { 2676 dump_printf_loc (MSG_NOTE, vect_location,
2294 dump_printf_loc (MSG_NOTE, vect_location, 2677 "***** Re-trying analysis with vector mode %s\n",
2295 "***** Re-trying analysis with " 2678 GET_MODE_NAME (next_vector_mode));
2296 "vector size "); 2679 }
2297 dump_dec (MSG_NOTE, current_vector_size); 2680
2298 dump_printf (MSG_NOTE, "\n"); 2681 if (first_loop_vinfo)
2299 } 2682 {
2300 } 2683 loop->aux = (loop_vec_info) first_loop_vinfo;
2684 if (dump_enabled_p ())
2685 dump_printf_loc (MSG_NOTE, vect_location,
2686 "***** Choosing vector mode %s\n",
2687 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2688 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2689 return first_loop_vinfo;
2690 }
2691
2692 return opt_loop_vec_info::propagate_failure (res);
2301 } 2693 }
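One case the driver above treats specially is an explicit simdlen request. A hypothetical source-level example (compiled with -fopenmp or -fopenmp-simd) is sketched below; once some analysis succeeds with a VF of 8, earlier attempts are discarded in favour of it.

/* loop->simdlen is set from the simdlen clause; the loop above then prefers
   a matching VF over other candidates.  The chosen vector mode is still
   target-dependent.  */
void
saxpy (float *restrict y, const float *restrict x, float a, int n)
{
#pragma omp simd simdlen(8)
  for (int i = 0; i < n; i++)
    y[i] = a * x[i] + y[i];
}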
2302 2694
2303 /* Return true if there is an in-order reduction function for CODE, storing 2695 /* Return true if there is an in-order reduction function for CODE, storing
2304 it in *REDUC_FN if so. */ 2696 it in *REDUC_FN if so. */
2305 2697
2369 } 2761 }
2370 } 2762 }
2371 2763
2372 /* If there is a neutral value X such that SLP reduction NODE would not 2764 /* If there is a neutral value X such that SLP reduction NODE would not
2373 be affected by the introduction of additional X elements, return that X, 2765 be affected by the introduction of additional X elements, return that X,
2374 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN 2766 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2375 is true if the SLP statements perform a single reduction, false if each 2767 is the vector type that would hold element X. REDUC_CHAIN is true if
2376 statement performs an independent reduction. */ 2768 the SLP statements perform a single reduction, false if each statement
2769 performs an independent reduction. */
2377 2770
2378 static tree 2771 static tree
2379 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code, 2772 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2380 bool reduc_chain) 2773 tree_code code, bool reduc_chain)
2381 { 2774 {
2382 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); 2775 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2383 stmt_vec_info stmt_vinfo = stmts[0]; 2776 stmt_vec_info stmt_vinfo = stmts[0];
2384 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2385 tree scalar_type = TREE_TYPE (vector_type); 2777 tree scalar_type = TREE_TYPE (vector_type);
2386 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father; 2778 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2387 gcc_assert (loop); 2779 gcc_assert (loop);
2388 2780
2389 switch (code) 2781 switch (code)
2390 { 2782 {
2391 case WIDEN_SUM_EXPR: 2783 case WIDEN_SUM_EXPR:
2425 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg) 2817 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2426 { 2818 {
2427 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt); 2819 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2428 } 2820 }
2429 2821
2430 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2431 operation. Return true if the results of DEF_STMT_INFO are something
2432 that can be accumulated by such a reduction. */
2433
2434 static bool
2435 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2436 {
2437 return (is_gimple_assign (def_stmt_info->stmt)
2438 || is_gimple_call (def_stmt_info->stmt)
2439 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2440 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2441 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2442 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2443 }
2444
2445 /* Detect SLP reduction of the form:
2446
2447 #a1 = phi <a5, a0>
2448 a2 = operation (a1)
2449 a3 = operation (a2)
2450 a4 = operation (a3)
2451 a5 = operation (a4)
2452
2453 #a = phi <a5>
2454
2455 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2456 FIRST_STMT is the first reduction stmt in the chain
2457 (a2 = operation (a1)).
2458
2459 Return TRUE if a reduction chain was detected. */
2460
2461 static bool
2462 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2463 gimple *first_stmt)
2464 {
2465 struct loop *loop = (gimple_bb (phi))->loop_father;
2466 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2467 enum tree_code code;
2468 gimple *loop_use_stmt = NULL;
2469 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2470 tree lhs;
2471 imm_use_iterator imm_iter;
2472 use_operand_p use_p;
2473 int nloop_uses, size = 0, n_out_of_loop_uses;
2474 bool found = false;
2475
2476 if (loop != vect_loop)
2477 return false;
2478
2479 lhs = PHI_RESULT (phi);
2480 code = gimple_assign_rhs_code (first_stmt);
2481 while (1)
2482 {
2483 nloop_uses = 0;
2484 n_out_of_loop_uses = 0;
2485 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2486 {
2487 gimple *use_stmt = USE_STMT (use_p);
2488 if (is_gimple_debug (use_stmt))
2489 continue;
2490
2491 /* Check if we got back to the reduction phi. */
2492 if (use_stmt == phi)
2493 {
2494 loop_use_stmt = use_stmt;
2495 found = true;
2496 break;
2497 }
2498
2499 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2500 {
2501 loop_use_stmt = use_stmt;
2502 nloop_uses++;
2503 }
2504 else
2505 n_out_of_loop_uses++;
2506
2507 /* There can be either a single use in the loop or two uses in
2508 phi nodes. */
2509 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2510 return false;
2511 }
2512
2513 if (found)
2514 break;
2515
2516 /* We reached a statement with no loop uses. */
2517 if (nloop_uses == 0)
2518 return false;
2519
2520 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2521 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2522 return false;
2523
2524 if (!is_gimple_assign (loop_use_stmt)
2525 || code != gimple_assign_rhs_code (loop_use_stmt)
2526 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2527 return false;
2528
2529 /* Insert USE_STMT into reduction chain. */
2530 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2531 if (current_stmt_info)
2532 {
2533 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2534 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2535 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2536 }
2537 else
2538 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2539
2540 lhs = gimple_assign_lhs (loop_use_stmt);
2541 current_stmt_info = use_stmt_info;
2542 size++;
2543 }
2544
2545 if (!found || loop_use_stmt != phi || size < 2)
2546 return false;
2547
2548 /* Swap the operands, if needed, to make the reduction operand be the second
2549 operand. */
2550 lhs = PHI_RESULT (phi);
2551 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2552 while (next_stmt_info)
2553 {
2554 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2555 if (gimple_assign_rhs2 (next_stmt) == lhs)
2556 {
2557 tree op = gimple_assign_rhs1 (next_stmt);
2558 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2559
2560 /* Check that the other def is either defined in the loop
2561 ("vect_internal_def"), or it's an induction (defined by a
2562 loop-header phi-node). */
2563 if (def_stmt_info
2564 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2565 && vect_valid_reduction_input_p (def_stmt_info))
2566 {
2567 lhs = gimple_assign_lhs (next_stmt);
2568 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2569 continue;
2570 }
2571
2572 return false;
2573 }
2574 else
2575 {
2576 tree op = gimple_assign_rhs2 (next_stmt);
2577 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2578
2579 /* Check that the other def is either defined in the loop
2580 ("vect_internal_def"), or it's an induction (defined by a
2581 loop-header phi-node). */
2582 if (def_stmt_info
2583 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2584 && vect_valid_reduction_input_p (def_stmt_info))
2585 {
2586 if (dump_enabled_p ())
2587 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2588 next_stmt);
2589
2590 swap_ssa_operands (next_stmt,
2591 gimple_assign_rhs1_ptr (next_stmt),
2592 gimple_assign_rhs2_ptr (next_stmt));
2593 update_stmt (next_stmt);
2594
2595 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2596 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2597 }
2598 else
2599 return false;
2600 }
2601
2602 lhs = gimple_assign_lhs (next_stmt);
2603 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2604 }
2605
2606 /* Save the chain for further analysis in SLP detection. */
2607 stmt_vec_info first_stmt_info
2608 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2609 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2610 REDUC_GROUP_SIZE (first_stmt_info) = size;
2611
2612 return true;
2613 }
2614
2615 /* Return true if we need an in-order reduction for operation CODE 2822 /* Return true if we need an in-order reduction for operation CODE
2616 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer 2823 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2617 overflow must wrap. */ 2824 overflow must wrap. */
2618 2825
2619 static bool 2826 bool
2620 needs_fold_left_reduction_p (tree type, tree_code code, 2827 needs_fold_left_reduction_p (tree type, tree_code code)
2621 bool need_wrapping_integral_overflow)
2622 { 2828 {
2623 /* CHECKME: check for !flag_finite_math_only too? */ 2829 /* CHECKME: check for !flag_finite_math_only too? */
2624 if (SCALAR_FLOAT_TYPE_P (type)) 2830 if (SCALAR_FLOAT_TYPE_P (type))
2625 switch (code) 2831 switch (code)
2626 { 2832 {
2634 2840
2635 if (INTEGRAL_TYPE_P (type)) 2841 if (INTEGRAL_TYPE_P (type))
2636 { 2842 {
2637 if (!operation_no_trapping_overflow (type, code)) 2843 if (!operation_no_trapping_overflow (type, code))
2638 return true; 2844 return true;
2639 if (need_wrapping_integral_overflow
2640 && !TYPE_OVERFLOW_WRAPS (type)
2641 && operation_can_overflow (code))
2642 return true;
2643 return false; 2845 return false;
2644 } 2846 }
2645 2847
2646 if (SAT_FIXED_POINT_TYPE_P (type)) 2848 if (SAT_FIXED_POINT_TYPE_P (type))
2647 return true; 2849 return true;
2648 2850
2649 return false; 2851 return false;
2650 } 2852 }
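Two hypothetical scalar reductions illustrating the predicate above under default compiler flags; they are illustrations only, not test cases from the GCC sources.

/* FP addition: reassociating the sum changes rounding, so an in-order
   (fold-left) reduction is needed unless -fassociative-math is in effect.  */
float
fp_sum (const float *a, int n)
{
  float s = 0.0f;
  for (int i = 0; i < n; i++)
    s += a[i];
  return s;
}

/* Integer addition: with default flags the operation has no trapping
   overflow, so an ordinary reassociated reduction is fine.  */
int
int_sum (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s += a[i];
  return s;
}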
2651 2853
2652 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and 2854 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
2653 reduction operation CODE has a handled computation expression. */ 2855 has a handled computation expression. Store the main reduction
2654 2856 operation in *CODE. */
2655 bool 2857
2858 static bool
2656 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, 2859 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2657 tree loop_arg, enum tree_code code) 2860 tree loop_arg, enum tree_code *code,
2861 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2658 { 2862 {
2659 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2660 auto_bitmap visited; 2863 auto_bitmap visited;
2661 tree lookfor = PHI_RESULT (phi); 2864 tree lookfor = PHI_RESULT (phi);
2662 ssa_op_iter curri; 2865 ssa_op_iter curri;
2663 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); 2866 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2664 while (USE_FROM_PTR (curr) != loop_arg) 2867 while (USE_FROM_PTR (curr) != loop_arg)
2722 } 2925 }
2723 2926
2724 /* Check whether the reduction path detected is valid. */ 2927 /* Check whether the reduction path detected is valid. */
2725 bool fail = path.length () == 0; 2928 bool fail = path.length () == 0;
2726 bool neg = false; 2929 bool neg = false;
2930 int sign = -1;
2931 *code = ERROR_MARK;
2727 for (unsigned i = 1; i < path.length (); ++i) 2932 for (unsigned i = 1; i < path.length (); ++i)
2728 { 2933 {
2729 gimple *use_stmt = USE_STMT (path[i].second); 2934 gimple *use_stmt = USE_STMT (path[i].second);
2730 tree op = USE_FROM_PTR (path[i].second); 2935 tree op = USE_FROM_PTR (path[i].second);
2731 if (! has_single_use (op) 2936 if (! is_gimple_assign (use_stmt)
2732 || ! is_gimple_assign (use_stmt)) 2937 /* The following make sure we can compute the operand index
2938 easily plus it mostly disallows chaining via COND_EXPR condition
2939 operands. */
2940 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
2941 && (gimple_num_ops (use_stmt) <= 2
2942 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
2943 && (gimple_num_ops (use_stmt) <= 3
2944 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
2733 { 2945 {
2734 fail = true; 2946 fail = true;
2735 break; 2947 break;
2736 } 2948 }
2737 if (gimple_assign_rhs_code (use_stmt) != code) 2949 /* Check there's only a single stmt the op is used on inside
2738 { 2950 of the loop. */
2739 if (code == PLUS_EXPR 2951 imm_use_iterator imm_iter;
2740 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) 2952 gimple *op_use_stmt;
2741 { 2953 unsigned cnt = 0;
2742 /* Track whether we negate the reduction value each iteration. */ 2954 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
2743 if (gimple_assign_rhs2 (use_stmt) == op) 2955 if (!is_gimple_debug (op_use_stmt)
2744 neg = ! neg; 2956 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
2745 } 2957 {
2746 else 2958 /* We want to allow x + x but not x < 1 ? x : 2. */
2747 { 2959 if (is_gimple_assign (op_use_stmt)
2748 fail = true; 2960 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
2749 break; 2961 {
2750 } 2962 use_operand_p use_p;
2751 } 2963 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2752 } 2964 cnt++;
2753 return ! fail && ! neg; 2965 }
2966 else
2967 cnt++;
2968 }
2969 if (cnt != 1)
2970 {
2971 fail = true;
2972 break;
2973 }
2974 tree_code use_code = gimple_assign_rhs_code (use_stmt);
2975 if (use_code == MINUS_EXPR)
2976 {
2977 use_code = PLUS_EXPR;
2978 /* Track whether we negate the reduction value each iteration. */
2979 if (gimple_assign_rhs2 (use_stmt) == op)
2980 neg = ! neg;
2981 }
2982 if (CONVERT_EXPR_CODE_P (use_code)
2983 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
2984 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
2985 ;
2986 else if (*code == ERROR_MARK)
2987 {
2988 *code = use_code;
2989 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
2990 }
2991 else if (use_code != *code)
2992 {
2993 fail = true;
2994 break;
2995 }
2996 else if ((use_code == MIN_EXPR
2997 || use_code == MAX_EXPR)
2998 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
2999 {
3000 fail = true;
3001 break;
3002 }
3003 }
3004 return ! fail && ! neg && *code != ERROR_MARK;
2754 } 3005 }
3006
3007 bool
3008 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3009 tree loop_arg, enum tree_code code)
3010 {
3011 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3012 enum tree_code code_;
3013 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3014 && code_ == code);
3015 }
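A hypothetical scalar loop whose reduction cycle the path walk above should accept: every statement between the PHI and the latch value uses the same PLUS operation and each intermediate result has a single in-loop use. This is only a sketch of the shape being matched, not a guarantee about the final vectorization decision.

int
sum3 (const int *a, const int *b, const int *c, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    /* Gimplified roughly as s_2 = s_1 + a[i]; s_3 = s_2 + b[i];
       s_4 = s_3 + c[i]; -- a single-use PLUS chain back to the PHI.
       Mixing in a different code (say s_3 = s_2 * b[i]) would make the
       walk fail.  */
    s = s + a[i] + b[i] + c[i];
  return s;
}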
3016
2755 3017
2756 3018
2757 /* Function vect_is_simple_reduction 3019 /* Function vect_is_simple_reduction
2758 3020
2759 (1) Detect a cross-iteration def-use cycle that represents a simple 3021 (1) Detect a cross-iteration def-use cycle that represents a simple
2798 3060
2799 */ 3061 */
2800 3062
2801 static stmt_vec_info 3063 static stmt_vec_info
2802 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, 3064 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2803 bool *double_reduc, 3065 bool *double_reduc, bool *reduc_chain_p)
2804 bool need_wrapping_integral_overflow,
2805 enum vect_reduction_type *v_reduc_type)
2806 { 3066 {
2807 gphi *phi = as_a <gphi *> (phi_info->stmt); 3067 gphi *phi = as_a <gphi *> (phi_info->stmt);
2808 struct loop *loop = (gimple_bb (phi))->loop_father;
2809 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2810 gimple *phi_use_stmt = NULL; 3068 gimple *phi_use_stmt = NULL;
2811 enum tree_code orig_code, code;
2812 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2813 tree type;
2814 int nloop_uses;
2815 tree name;
2816 imm_use_iterator imm_iter; 3069 imm_use_iterator imm_iter;
2817 use_operand_p use_p; 3070 use_operand_p use_p;
2818 bool phi_def;
2819 3071
2820 *double_reduc = false; 3072 *double_reduc = false;
2821 *v_reduc_type = TREE_CODE_REDUCTION; 3073 *reduc_chain_p = false;
3074 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
2822 3075
2823 tree phi_name = PHI_RESULT (phi); 3076 tree phi_name = PHI_RESULT (phi);
2824 /* ??? If there are no uses of the PHI result the inner loop reduction 3077 /* ??? If there are no uses of the PHI result the inner loop reduction
2825 won't be detected as possibly double-reduction by vectorizable_reduction 3078 won't be detected as possibly double-reduction by vectorizable_reduction
2826 because that tries to walk the PHI arg from the preheader edge which 3079 because that tries to walk the PHI arg from the preheader edge which
2827 can be constant. See PR60382. */ 3080 can be constant. See PR60382. */
2828 if (has_zero_uses (phi_name)) 3081 if (has_zero_uses (phi_name))
2829 return NULL; 3082 return NULL;
2830 nloop_uses = 0; 3083 class loop *loop = (gimple_bb (phi))->loop_father;
3084 unsigned nphi_def_loop_uses = 0;
2831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) 3085 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2832 { 3086 {
2833 gimple *use_stmt = USE_STMT (use_p); 3087 gimple *use_stmt = USE_STMT (use_p);
2834 if (is_gimple_debug (use_stmt)) 3088 if (is_gimple_debug (use_stmt))
2835 continue; 3089 continue;
2841 "intermediate value used outside loop.\n"); 3095 "intermediate value used outside loop.\n");
2842 3096
2843 return NULL; 3097 return NULL;
2844 } 3098 }
2845 3099
2846 nloop_uses++; 3100 nphi_def_loop_uses++;
2847 if (nloop_uses > 1)
2848 {
2849 if (dump_enabled_p ())
2850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2851 "reduction value used in loop.\n");
2852 return NULL;
2853 }
2854
2855 phi_use_stmt = use_stmt; 3101 phi_use_stmt = use_stmt;
2856 } 3102 }
2857 3103
2858 edge latch_e = loop_latch_edge (loop); 3104 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
2859 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 3105 if (TREE_CODE (latch_def) != SSA_NAME)
2860 if (TREE_CODE (loop_arg) != SSA_NAME)
2861 { 3106 {
2862 if (dump_enabled_p ()) 3107 if (dump_enabled_p ())
2863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2864 "reduction: not ssa_name: %T\n", loop_arg); 3109 "reduction: not ssa_name: %T\n", latch_def);
2865 return NULL; 3110 return NULL;
2866 } 3111 }
2867 3112
2868 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg); 3113 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
2869 if (!def_stmt_info 3114 if (!def_stmt_info
2870 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))) 3115 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2871 return NULL; 3116 return NULL;
2872 3117
2873 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt)) 3118 bool nested_in_vect_loop
2874 { 3119 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
2875 name = gimple_assign_lhs (def_stmt); 3120 unsigned nlatch_def_loop_uses = 0;
2876 phi_def = false;
2877 }
2878 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2879 {
2880 name = PHI_RESULT (def_stmt);
2881 phi_def = true;
2882 }
2883 else
2884 {
2885 if (dump_enabled_p ())
2886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2887 "reduction: unhandled reduction operation: %G",
2888 def_stmt_info->stmt);
2889 return NULL;
2890 }
2891
2892 nloop_uses = 0;
2893 auto_vec<gphi *, 3> lcphis; 3121 auto_vec<gphi *, 3> lcphis;
2894 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) 3122 bool inner_loop_of_double_reduc = false;
3123 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
2895 { 3124 {
2896 gimple *use_stmt = USE_STMT (use_p); 3125 gimple *use_stmt = USE_STMT (use_p);
2897 if (is_gimple_debug (use_stmt)) 3126 if (is_gimple_debug (use_stmt))
2898 continue; 3127 continue;
2899 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 3128 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2900 nloop_uses++; 3129 nlatch_def_loop_uses++;
2901 else 3130 else
2902 /* We can have more than one loop-closed PHI. */ 3131 {
2903 lcphis.safe_push (as_a <gphi *> (use_stmt)); 3132 /* We can have more than one loop-closed PHI. */
2904 if (nloop_uses > 1) 3133 lcphis.safe_push (as_a <gphi *> (use_stmt));
2905 { 3134 if (nested_in_vect_loop
2906 if (dump_enabled_p ()) 3135 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3136 == vect_double_reduction_def))
2908 "reduction used in loop.\n"); 3137 inner_loop_of_double_reduc = true;
2909 return NULL; 3138 }
2910 } 3139 }
3140
3141 /* If we are vectorizing an inner reduction we are executing that
3142 in the original order only in case we are not dealing with a
3143 double reduction. */
3144 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3145 {
3146 if (dump_enabled_p ())
3147 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3148 "detected nested cycle: ");
3149 return def_stmt_info;
3150 }
3151
3152 /* If this isn't a nested cycle or if the nested cycle reduction value
3153 is used outside of the inner loop we cannot handle uses of the reduction
3154 value. */
3155 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3156 {
3157 if (dump_enabled_p ())
3158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3159 "reduction used in loop.\n");
3160 return NULL;
2911 } 3161 }
2912 3162
2913 /* If DEF_STMT is a phi node itself, we expect it to have a single argument 3163 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2914 defined in the inner loop. */ 3164 defined in the inner loop. */
2915 if (phi_def) 3165 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2916 { 3166 {
2917 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt); 3167 tree op1 = PHI_ARG_DEF (def_stmt, 0);
2918 op1 = PHI_ARG_DEF (def_stmt, 0);
2919
2920 if (gimple_phi_num_args (def_stmt) != 1 3168 if (gimple_phi_num_args (def_stmt) != 1
2921 || TREE_CODE (op1) != SSA_NAME) 3169 || TREE_CODE (op1) != SSA_NAME)
2922 { 3170 {
2923 if (dump_enabled_p ()) 3171 if (dump_enabled_p ())
2924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2945 } 3193 }
2946 3194
2947 return NULL; 3195 return NULL;
2948 } 3196 }
2949 3197
2950 /* If we are vectorizing an inner reduction we are executing that 3198 /* Look for the expression computing latch_def from the loop PHI result. */
2951 in the original order only in case we are not dealing with a 3199 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2952 double reduction. */ 3200 enum tree_code code;
2953 bool check_reduction = true; 3201 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
2954 if (flow_loop_nested_p (vect_loop, loop)) 3202 path))
2955 { 3203 {
2956 gphi *lcphi; 3204 STMT_VINFO_REDUC_CODE (phi_info) = code;
3205 if (code == COND_EXPR && !nested_in_vect_loop)
3206 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3207
3208 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3209 reduction chain for which the additional restriction is that
3210 all operations in the chain are the same. */
3211 auto_vec<stmt_vec_info, 8> reduc_chain;
2957 unsigned i; 3212 unsigned i;
2958 check_reduction = false; 3213 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
2959 FOR_EACH_VEC_ELT (lcphis, i, lcphi) 3214 for (i = path.length () - 1; i >= 1; --i)
2960 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi)) 3215 {
2961 { 3216 gimple *stmt = USE_STMT (path[i].second);
2962 gimple *use_stmt = USE_STMT (use_p); 3217 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
2963 if (is_gimple_debug (use_stmt)) 3218 STMT_VINFO_REDUC_IDX (stmt_info)
2964 continue; 3219 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
2965 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt))) 3220 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
2966 check_reduction = true; 3221 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
2967 } 3222 && (i == 1 || i == path.length () - 1));
2968 } 3223 if ((stmt_code != code && !leading_conversion)
2969 3224 /* We can only handle the final value in epilogue
2970 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt); 3225 generation for reduction chains. */
2971 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); 3226 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
2972 code = orig_code = gimple_assign_rhs_code (def_stmt); 3227 is_slp_reduc = false;
2973 3228 /* For reduction chains we support trailing/leading
2974 /* We can handle "res -= x[i]", which is non-associative by 3229 conversions. We do not store those in the actual chain. */
2975 simply rewriting this into "res += -x[i]". Avoid changing 3230 if (leading_conversion)
2976 gimple instruction for the first simple tests and only do this 3231 continue;
2977 if we're allowed to change code at all. */ 3232 reduc_chain.safe_push (stmt_info);
2978 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name) 3233 }
2979 code = PLUS_EXPR; 3234 if (is_slp_reduc && reduc_chain.length () > 1)
2980 3235 {
2981 if (code == COND_EXPR) 3236 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2982 { 3237 {
2983 if (! nested_in_vect_loop) 3238 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2984 *v_reduc_type = COND_REDUCTION; 3239 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2985 3240 }
2986 op3 = gimple_assign_rhs1 (def_stmt); 3241 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2987 if (COMPARISON_CLASS_P (op3)) 3242 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2988 { 3243
2989 op4 = TREE_OPERAND (op3, 1); 3244 /* Save the chain for further analysis in SLP detection. */
2990 op3 = TREE_OPERAND (op3, 0); 3245 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2991 } 3246 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
2992 if (op3 == phi_name || op4 == phi_name) 3247
2993 { 3248 *reduc_chain_p = true;
2994 if (dump_enabled_p ()) 3249 if (dump_enabled_p ())
2995 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3250 dump_printf_loc (MSG_NOTE, vect_location,
2996 "reduction: condition depends on previous" 3251 "reduction: detected reduction chain\n");
2997 " iteration: "); 3252 }
2998 return NULL; 3253 else if (dump_enabled_p ())
2999 } 3254 dump_printf_loc (MSG_NOTE, vect_location,
3000 3255 "reduction: detected reduction\n");
3001 op1 = gimple_assign_rhs2 (def_stmt); 3256
3002 op2 = gimple_assign_rhs3 (def_stmt);
3003 }
3004 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3005 {
3006 if (dump_enabled_p ())
3007 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3008 "reduction: not commutative/associative: ");
3009 return NULL;
3010 }
3011 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3012 {
3013 op1 = gimple_assign_rhs1 (def_stmt);
3014 op2 = gimple_assign_rhs2 (def_stmt);
3015 }
3016 else
3017 {
3018 if (dump_enabled_p ())
3019 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3020 "reduction: not handled operation: ");
3021 return NULL;
3022 }
3023
3024 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3025 {
3026 if (dump_enabled_p ())
3027 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3028 "reduction: both uses not ssa_names: ");
3029
3030 return NULL;
3031 }
3032
3033 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3034 if ((TREE_CODE (op1) == SSA_NAME
3035 && !types_compatible_p (type,TREE_TYPE (op1)))
3036 || (TREE_CODE (op2) == SSA_NAME
3037 && !types_compatible_p (type, TREE_TYPE (op2)))
3038 || (op3 && TREE_CODE (op3) == SSA_NAME
3039 && !types_compatible_p (type, TREE_TYPE (op3)))
3040 || (op4 && TREE_CODE (op4) == SSA_NAME
3041 && !types_compatible_p (type, TREE_TYPE (op4))))
3042 {
3043 if (dump_enabled_p ())
3044 {
3045 dump_printf_loc (MSG_NOTE, vect_location,
3046 "reduction: multiple types: operation type: "
3047 "%T, operands types: %T,%T",
3048 type, TREE_TYPE (op1), TREE_TYPE (op2));
3049 if (op3)
3050 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3051
3052 if (op4)
3053 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3054 dump_printf (MSG_NOTE, "\n");
3055 }
3056
3057 return NULL;
3058 }
3059
3060 /* Check whether it's ok to change the order of the computation.
3061 Generally, when vectorizing a reduction we change the order of the
3062 computation. This may change the behavior of the program in some
3063 cases, so we need to check that this is ok. One exception is when
3064 vectorizing an outer-loop: the inner-loop is executed sequentially,
3065 and therefore vectorizing reductions in the inner-loop during
3066 outer-loop vectorization is safe. */
3067 if (check_reduction
3068 && *v_reduc_type == TREE_CODE_REDUCTION
3069 && needs_fold_left_reduction_p (type, code,
3070 need_wrapping_integral_overflow))
3071 *v_reduc_type = FOLD_LEFT_REDUCTION;
3072
3073 /* Reduction is safe. We're dealing with one of the following:
3074 1) integer arithmetic and no trapv
3075 2) floating point arithmetic, and special flags permit this optimization
3076 3) nested cycle (i.e., outer loop vectorization). */
3077 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3078 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3079 if (code != COND_EXPR && !def1_info && !def2_info)
3080 {
3081 if (dump_enabled_p ())
3082 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3083 return NULL;
3084 }
3085
3086 /* Check that one def is the reduction def, defined by PHI,
3087 the other def is either defined in the loop ("vect_internal_def"),
3088 or it's an induction (defined by a loop-header phi-node). */
3089
3090 if (def2_info
3091 && def2_info->stmt == phi
3092 && (code == COND_EXPR
3093 || !def1_info
3094 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3095 || vect_valid_reduction_input_p (def1_info)))
3096 {
3097 if (dump_enabled_p ())
3098 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3099 return def_stmt_info; 3257 return def_stmt_info;
3100 } 3258 }
3101 3259
3102 if (def1_info
3103 && def1_info->stmt == phi
3104 && (code == COND_EXPR
3105 || !def2_info
3106 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3107 || vect_valid_reduction_input_p (def2_info)))
3108 {
3109 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3110 {
3111 /* Check if we can swap operands (just for simplicity - so that
3112 the rest of the code can assume that the reduction variable
3113 is always the last (second) argument). */
3114 if (code == COND_EXPR)
3115 {
3116 /* Swap cond_expr by inverting the condition. */
3117 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3118 enum tree_code invert_code = ERROR_MARK;
3119 enum tree_code cond_code = TREE_CODE (cond_expr);
3120
3121 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3122 {
3123 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3124 invert_code = invert_tree_comparison (cond_code, honor_nans);
3125 }
3126 if (invert_code != ERROR_MARK)
3127 {
3128 TREE_SET_CODE (cond_expr, invert_code);
3129 swap_ssa_operands (def_stmt,
3130 gimple_assign_rhs2_ptr (def_stmt),
3131 gimple_assign_rhs3_ptr (def_stmt));
3132 }
3133 else
3134 {
3135 if (dump_enabled_p ())
3136 report_vect_op (MSG_NOTE, def_stmt,
3137 "detected reduction: cannot swap operands "
3138 "for cond_expr");
3139 return NULL;
3140 }
3141 }
3142 else
3143 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3144 gimple_assign_rhs2_ptr (def_stmt));
3145
3146 if (dump_enabled_p ())
3147 report_vect_op (MSG_NOTE, def_stmt,
3148 "detected reduction: need to swap operands: ");
3149
3150 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3151 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3152 }
3153 else
3154 {
3155 if (dump_enabled_p ())
3156 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3157 }
3158
3159 return def_stmt_info;
3160 }
3161
3162 /* Try to find SLP reduction chain. */
3163 if (! nested_in_vect_loop
3164 && code != COND_EXPR
3165 && orig_code != MINUS_EXPR
3166 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3167 {
3168 if (dump_enabled_p ())
3169 report_vect_op (MSG_NOTE, def_stmt,
3170 "reduction: detected reduction chain: ");
3171
3172 return def_stmt_info;
3173 }
3174
3175 /* Dissolve group eventually half-built by vect_is_slp_reduction. */
3176 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3177 while (first)
3178 {
3179 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3180 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3181 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3182 first = next;
3183 }
3184
3185 /* Look for the expression computing loop_arg from loop PHI result. */
3186 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3187 return def_stmt_info;
3188
3189 if (dump_enabled_p ()) 3260 if (dump_enabled_p ())
3190 { 3261 dump_printf_loc (MSG_NOTE, vect_location,
3191 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3262 "reduction: unknown pattern\n");
3192 "reduction: unknown pattern: ");
3193 }
3194 3263
3195 return NULL; 3264 return NULL;
3196 }
3197
3198 /* Wrapper around vect_is_simple_reduction, which will modify code
3199 in-place if it enables detection of more reductions. Arguments
3200 as there. */
3201
3202 stmt_vec_info
3203 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3204 bool *double_reduc,
3205 bool need_wrapping_integral_overflow)
3206 {
3207 enum vect_reduction_type v_reduc_type;
3208 stmt_vec_info def_info
3209 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3210 need_wrapping_integral_overflow,
3211 &v_reduc_type);
3212 if (def_info)
3213 {
3214 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3215 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3216 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3217 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3218 }
3219 return def_info;
3220 } 3265 }
3221 3266
3222 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ 3267 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3223 int 3268 int
3224 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, 3269 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3240 3285
3241 /* If peeled iterations are known but number of scalar loop 3286 /* If peeled iterations are known but number of scalar loop
3242 iterations are unknown, count a taken branch per peeled loop. */ 3287 iterations are unknown, count a taken branch per peeled loop. */
3243 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, 3288 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3244 NULL, 0, vect_prologue); 3289 NULL, 0, vect_prologue);
3245 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, 3290 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3246 NULL, 0, vect_epilogue); 3291 NULL, 0, vect_epilogue);
3247 } 3292 }
3248 else 3293 else
3249 { 3294 {
3250 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); 3295 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3251 peel_iters_prologue = niters < peel_iters_prologue ? 3296 peel_iters_prologue = niters < peel_iters_prologue ?
3309 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3354 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3310 3355
3311 /* Cost model disabled. */ 3356 /* Cost model disabled. */
3312 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 3357 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3313 { 3358 {
3314 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); 3359 if (dump_enabled_p ())
3360 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3315 *ret_min_profitable_niters = 0; 3361 *ret_min_profitable_niters = 0;
3316 *ret_min_profitable_estimate = 0; 3362 *ret_min_profitable_estimate = 0;
3317 return; 3363 return;
3318 } 3364 }
3319 3365
3322 { 3368 {
3323 /* FIXME: Make cost depend on complexity of individual check. */ 3369 /* FIXME: Make cost depend on complexity of individual check. */
3324 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length (); 3370 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3325 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, 3371 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3326 vect_prologue); 3372 vect_prologue);
3327 dump_printf (MSG_NOTE, 3373 if (dump_enabled_p ())
3328 "cost model: Adding cost of checks for loop " 3374 dump_printf (MSG_NOTE,
3329 "versioning to treat misalignment.\n"); 3375 "cost model: Adding cost of checks for loop "
3376 "versioning to treat misalignment.\n");
3330 } 3377 }
3331 3378
3332 /* Requires loop versioning with alias checks. */ 3379 /* Requires loop versioning with alias checks. */
3333 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 3380 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3334 { 3381 {
3351 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) 3398 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3352 nstmts += 1; 3399 nstmts += 1;
3353 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt, 3400 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3354 NULL, 0, vect_prologue); 3401 NULL, 0, vect_prologue);
3355 } 3402 }
3356 dump_printf (MSG_NOTE, 3403 if (dump_enabled_p ())
3357 "cost model: Adding cost of checks for loop " 3404 dump_printf (MSG_NOTE,
3358 "versioning aliasing.\n"); 3405 "cost model: Adding cost of checks for loop "
3406 "versioning aliasing.\n");
3359 } 3407 }
3360 3408
3361 /* Requires loop versioning with niter checks. */ 3409 /* Requires loop versioning with niter checks. */
3362 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo)) 3410 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3363 { 3411 {
3364 /* FIXME: Make cost depend on complexity of individual check. */ 3412 /* FIXME: Make cost depend on complexity of individual check. */
3365 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0, 3413 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3366 vect_prologue); 3414 vect_prologue);
3367 dump_printf (MSG_NOTE, 3415 if (dump_enabled_p ())
3368 "cost model: Adding cost of checks for loop " 3416 dump_printf (MSG_NOTE,
3369 "versioning niters.\n"); 3417 "cost model: Adding cost of checks for loop "
3418 "versioning niters.\n");
3370 } 3419 }
3371 3420
3372 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3421 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3373 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0, 3422 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3374 vect_prologue); 3423 vect_prologue);
3408 j, si) 3457 j, si)
3409 (void) add_stmt_cost (target_cost_data, si->count, 3458 (void) add_stmt_cost (target_cost_data, si->count,
3410 si->kind, si->stmt_info, si->misalign, 3459 si->kind, si->stmt_info, si->misalign,
3411 vect_epilogue); 3460 vect_epilogue);
3412 } 3461 }
3462
3463 /* Calculate how many masks we need to generate. */
3464 unsigned int num_masks = 0;
3465 rgroup_masks *rgm;
3466 unsigned int num_vectors_m1;
3467 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3468 if (rgm->mask_type)
3469 num_masks += num_vectors_m1 + 1;
3470 gcc_assert (num_masks > 0);
3471
3472 /* In the worst case, we need to generate each mask in the prologue
3473 and in the loop body. One of the loop body mask instructions
3474 replaces the comparison in the scalar loop, and since we don't
3475 count the scalar comparison against the scalar body, we shouldn't
3476 count that vector instruction against the vector body either.
3477
3478 Sometimes we can use unpacks instead of generating prologue
3479 masks and sometimes the prologue mask will fold to a constant,
3480 so the actual prologue cost might be smaller. However, it's
3481 simpler and safer to use the worst-case cost; if this ends up
3482 being the tie-breaker between vectorizing or not, then it's
3483 probably better not to vectorize. */
3484 (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
3485 NULL, 0, vect_prologue);
3486 (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
3487 NULL, 0, vect_body);
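(A small worked instance of the bookkeeping above, with illustrative numbers: if LOOP_VINFO_MASKS holds two rgroups that need one and two mask vectors respectively, then num_masks = 1 + 2 = 3, so the worst case charges three vector_stmt costs to the prologue and num_masks - 1 = 2 to the loop body, the uncounted body mask standing in for the scalar loop's comparison.)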
3413 } 3488 }
3414 else if (npeel < 0) 3489 else if (npeel < 0)
3415 { 3490 {
3416 peel_iters_prologue = assumed_vf / 2; 3491 peel_iters_prologue = assumed_vf / 2;
3417 dump_printf (MSG_NOTE, "cost model: " 3492 if (dump_enabled_p ())
3418 "prologue peel iters set to vf/2.\n"); 3493 dump_printf (MSG_NOTE, "cost model: "
3494 "prologue peel iters set to vf/2.\n");
3419 3495
3420 /* If peeling for alignment is unknown, loop bound of main loop becomes 3496 /* If peeling for alignment is unknown, loop bound of main loop becomes
3421 unknown. */ 3497 unknown. */
3422 peel_iters_epilogue = assumed_vf / 2; 3498 peel_iters_epilogue = assumed_vf / 2;
3423 dump_printf (MSG_NOTE, "cost model: " 3499 if (dump_enabled_p ())
3424 "epilogue peel iters set to vf/2 because " 3500 dump_printf (MSG_NOTE, "cost model: "
3425 "peeling for alignment is unknown.\n"); 3501 "epilogue peel iters set to vf/2 because "
3502 "peeling for alignment is unknown.\n");
3426 3503
3427 /* If peeled iterations are unknown, count a taken branch and a not taken 3504 /* If peeled iterations are unknown, count a taken branch and a not taken
3428 branch per peeled loop. Even if scalar loop iterations are known, 3505 branch per peeled loop. Even if scalar loop iterations are known,
3429 vector iterations are not known since peeled prologue iterations are 3506 vector iterations are not known since peeled prologue iterations are
3430 not known. Hence guards remain the same. */ 3507 not known. Hence guards remain the same. */
3554 /* Complete the target-specific cost calculations. */ 3631 /* Complete the target-specific cost calculations. */
3555 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost, 3632 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3556 &vec_inside_cost, &vec_epilogue_cost); 3633 &vec_inside_cost, &vec_epilogue_cost);
3557 3634
3558 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); 3635 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3559 3636
3637 /* Stash the costs so that we can compare two loop_vec_infos. */
3638 loop_vinfo->vec_inside_cost = vec_inside_cost;
3639 loop_vinfo->vec_outside_cost = vec_outside_cost;
3640
3560 if (dump_enabled_p ()) 3641 if (dump_enabled_p ())
3561 { 3642 {
3562 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); 3643 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3563 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n", 3644 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3564 vec_inside_cost); 3645 vec_inside_cost);
3579 } 3660 }
3580 3661
3581 /* Calculate number of iterations required to make the vector version 3662 /* Calculate number of iterations required to make the vector version
3582 profitable, relative to the loop bodies only. The following condition 3663 profitable, relative to the loop bodies only. The following condition
3583 must hold true: 3664 must hold true:
3584 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC 3665 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3585 where 3666 where
3586 SIC = scalar iteration cost, VIC = vector iteration cost, 3667 SIC = scalar iteration cost, VIC = vector iteration cost,
3587 VOC = vector outside cost, VF = vectorization factor, 3668 VOC = vector outside cost, VF = vectorization factor,
3588 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations 3669 NPEEL = prologue iterations + epilogue iterations,
3589 SOC = scalar outside cost for run time cost model check. */ 3670 SOC = scalar outside cost for run time cost model check. */
3590 3671
3591 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost) 3672 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3592 { 3673 - vec_inside_cost);
3593 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) 3674 if (saving_per_viter <= 0)
3594 * assumed_vf
3595 - vec_inside_cost * peel_iters_prologue
3596 - vec_inside_cost * peel_iters_epilogue);
3597 if (min_profitable_iters <= 0)
3598 min_profitable_iters = 0;
3599 else
3600 {
3601 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3602 - vec_inside_cost);
3603
3604 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3605 <= (((int) vec_inside_cost * min_profitable_iters)
3606 + (((int) vec_outside_cost - scalar_outside_cost)
3607 * assumed_vf)))
3608 min_profitable_iters++;
3609 }
3610 }
3611 /* vector version will never be profitable. */
3612 else
3613 { 3675 {
3614 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize) 3676 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3615 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd, 3677 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3616 "vectorization did not happen for a simd loop"); 3678 "vectorization did not happen for a simd loop");
3617 3679
3625 *ret_min_profitable_niters = -1; 3687 *ret_min_profitable_niters = -1;
3626 *ret_min_profitable_estimate = -1; 3688 *ret_min_profitable_estimate = -1;
3627 return; 3689 return;
3628 } 3690 }
3629 3691
3630 dump_printf (MSG_NOTE, 3692 /* ??? The "if" arm is written to handle all cases; see below for what
3631 " Calculated minimum iters for profitability: %d\n", 3693 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3632 min_profitable_iters); 3694 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3695 {
3696 /* Rewriting the condition above in terms of the number of
3697 vector iterations (vniters) rather than the number of
3698 scalar iterations (niters) gives:
3699
3700 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3701
3702 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3703
3704 For integer N, X and Y when X > 0:
3705
3706 N * X > Y <==> N >= (Y /[floor] X) + 1. */
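(Worked instance of this derivation, with illustrative costs: take SIC = 4, VIC = 8, VF = 4, VOC = 40, SOC = 0 and no peeling. Then saving_per_viter = SIC * VF - VIC = 8 and the outside overhead is 40, so min_vec_niters = 40 / 8 + 1 = 6: at 6 vector iterations the vector cost is 6 * 8 + 40 = 88 against a scalar cost of 24 * 4 = 96, while at 5 the two sides tie at 80.)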
3707 int outside_overhead = (vec_outside_cost
3708 - scalar_single_iter_cost * peel_iters_prologue
3709 - scalar_single_iter_cost * peel_iters_epilogue
3710 - scalar_outside_cost);
3711 /* We're only interested in cases that require at least one
3712 vector iteration. */
3713 int min_vec_niters = 1;
3714 if (outside_overhead > 0)
3715 min_vec_niters = outside_overhead / saving_per_viter + 1;
3716
3717 if (dump_enabled_p ())
3718 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3719 min_vec_niters);
3720
3721 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3722 {
3723 /* Now that we know the minimum number of vector iterations,
3724 find the minimum niters for which the scalar cost is larger:
3725
3726 SIC * niters > VIC * vniters + VOC - SOC
3727
3728 We know that the minimum niters is no more than
3729 vniters * VF + NPEEL, but it might be (and often is) less
3730 than that if a partial vector iteration is cheaper than the
3731 equivalent scalar code. */
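(Continuing the illustrative numbers above for the fully-masked case: threshold = VIC * min_vec_niters + VOC - SOC = 8 * 6 + 40 = 88, so min_profitable_iters = 88 / SIC + 1 = 23; 23 scalar iterations cost 92, which exceeds the vector cost of 88 even though 23 is below vniters * VF = 24.)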
3732 int threshold = (vec_inside_cost * min_vec_niters
3733 + vec_outside_cost
3734 - scalar_outside_cost);
3735 if (threshold <= 0)
3736 min_profitable_iters = 1;
3737 else
3738 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3739 }
3740 else
3741 /* Convert the number of vector iterations into a number of
3742 scalar iterations. */
3743 min_profitable_iters = (min_vec_niters * assumed_vf
3744 + peel_iters_prologue
3745 + peel_iters_epilogue);
3746 }
3747 else
3748 {
3749 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3750 * assumed_vf
3751 - vec_inside_cost * peel_iters_prologue
3752 - vec_inside_cost * peel_iters_epilogue);
3753 if (min_profitable_iters <= 0)
3754 min_profitable_iters = 0;
3755 else
3756 {
3757 min_profitable_iters /= saving_per_viter;
3758
3759 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3760 <= (((int) vec_inside_cost * min_profitable_iters)
3761 + (((int) vec_outside_cost - scalar_outside_cost)
3762 * assumed_vf)))
3763 min_profitable_iters++;
3764 }
3765 }
3766
3767 if (dump_enabled_p ())
3768 dump_printf (MSG_NOTE,
3769 " Calculated minimum iters for profitability: %d\n",
3770 min_profitable_iters);
3633 3771
3634 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 3772 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3635 && min_profitable_iters < (assumed_vf + peel_iters_prologue)) 3773 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3636 /* We want the vectorized loop to execute at least once. */ 3774 /* We want the vectorized loop to execute at least once. */
3637 min_profitable_iters = assumed_vf + peel_iters_prologue; 3775 min_profitable_iters = assumed_vf + peel_iters_prologue;
3646 /* Calculate number of iterations required to make the vector version 3784 /* Calculate number of iterations required to make the vector version
3647 profitable, relative to the loop bodies only. 3785 profitable, relative to the loop bodies only.
3648 3786
3649 Non-vectorized variant is SIC * niters and it must win over vector 3787 Non-vectorized variant is SIC * niters and it must win over vector
3650 variant on the expected loop trip count. The following condition must hold true: 3788 variant on the expected loop trip count. The following condition must hold true:
3651 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */ 3789 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3652 3790
3653 if (vec_outside_cost <= 0) 3791 if (vec_outside_cost <= 0)
3654 min_profitable_estimate = 0; 3792 min_profitable_estimate = 0;
3793 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3794 {
3795 /* This is a repeat of the code above, but with + SOC rather
3796 than - SOC. */
3797 int outside_overhead = (vec_outside_cost
3798 - scalar_single_iter_cost * peel_iters_prologue
3799 - scalar_single_iter_cost * peel_iters_epilogue
3800 + scalar_outside_cost);
3801 int min_vec_niters = 1;
3802 if (outside_overhead > 0)
3803 min_vec_niters = outside_overhead / saving_per_viter + 1;
3804
3805 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3806 {
3807 int threshold = (vec_inside_cost * min_vec_niters
3808 + vec_outside_cost
3809 + scalar_outside_cost);
3810 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3811 }
3812 else
3813 min_profitable_estimate = (min_vec_niters * assumed_vf
3814 + peel_iters_prologue
3815 + peel_iters_epilogue);
3816 }
3655 else 3817 else
3656 { 3818 {
3657 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) 3819 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3658 * assumed_vf 3820 * assumed_vf
3659 - vec_inside_cost * peel_iters_prologue 3821 - vec_inside_cost * peel_iters_prologue
3718 generated within the strip-mine loop, the initial definition before 3880 generated within the strip-mine loop, the initial definition before
3719 the loop, and the epilogue code that must be generated. */ 3881 the loop, and the epilogue code that must be generated. */
3720 3882
3721 static void 3883 static void
3722 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn, 3884 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3885 vect_reduction_type reduction_type,
3723 int ncopies, stmt_vector_for_cost *cost_vec) 3886 int ncopies, stmt_vector_for_cost *cost_vec)
3724 { 3887 {
3725 int prologue_cost = 0, epilogue_cost = 0, inside_cost; 3888 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3726 enum tree_code code; 3889 enum tree_code code;
3727 optab optab; 3890 optab optab;
3728 tree vectype; 3891 tree vectype;
3729 machine_mode mode; 3892 machine_mode mode;
3730 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 3893 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3731 struct loop *loop = NULL; 3894 class loop *loop = NULL;
3732 3895
3733 if (loop_vinfo) 3896 if (loop_vinfo)
3734 loop = LOOP_VINFO_LOOP (loop_vinfo); 3897 loop = LOOP_VINFO_LOOP (loop_vinfo);
3735 3898
3736 /* Condition reductions generate two reductions in the loop. */ 3899 /* Condition reductions generate two reductions in the loop. */
3737 vect_reduction_type reduction_type
3738 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3739 if (reduction_type == COND_REDUCTION) 3900 if (reduction_type == COND_REDUCTION)
3740 ncopies *= 2; 3901 ncopies *= 2;
3741 3902
3742 vectype = STMT_VINFO_VECTYPE (stmt_info); 3903 vectype = STMT_VINFO_VECTYPE (stmt_info);
3743 mode = TYPE_MODE (vectype); 3904 mode = TYPE_MODE (vectype);
3744 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); 3905 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3745 3906
3746 code = gimple_assign_rhs_code (orig_stmt_info->stmt); 3907 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3747 3908
3748 if (reduction_type == EXTRACT_LAST_REDUCTION 3909 if (reduction_type == EXTRACT_LAST_REDUCTION)
3749 || reduction_type == FOLD_LEFT_REDUCTION) 3910 /* No extra instructions are needed in the prologue. The loop body
3911 operations are costed in vectorizable_condition. */
3912 inside_cost = 0;
3913 else if (reduction_type == FOLD_LEFT_REDUCTION)
3750 { 3914 {
3751 /* No extra instructions needed in the prologue. */ 3915 /* No extra instructions needed in the prologue. */
3752 prologue_cost = 0; 3916 prologue_cost = 0;
3753 3917
3754 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST) 3918 if (reduc_fn != IFN_LAST)
3755 /* Count one reduction-like operation per vector. */ 3919 /* Count one reduction-like operation per vector. */
3756 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar, 3920 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3757 stmt_info, 0, vect_body); 3921 stmt_info, 0, vect_body);
3758 else 3922 else
3759 { 3923 {
3952 initialization vector is simpler (same element in all entries), if 4116 initialization vector is simpler (same element in all entries), if
3953 ADJUSTMENT_DEF is not NULL, and Option2 otherwise. 4117 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3954 4118
3955 A cost model should help decide between these two schemes. */ 4119 A cost model should help decide between these two schemes. */
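(Concrete illustration, assuming a PLUS_EXPR sum reduction with scalar initial value s0 and a four-element vector: Option1 starts the vector PHI at {0, 0, 0, 0} and returns s0 in ADJUSTMENT_DEF so the epilogue adds it back once, while Option2 folds the initial value straight in and starts at {s0, 0, 0, 0} with no adjustment. For a MULT_EXPR reduction the neutral element is 1 rather than 0.)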
3956 4120
3957 tree 4121 static tree
3958 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, 4122 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
4123 enum tree_code code, tree init_val,
3959 tree *adjustment_def) 4124 tree *adjustment_def)
3960 { 4125 {
3961 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); 4126 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3962 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 4127 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3963 tree scalar_type = TREE_TYPE (init_val); 4128 tree scalar_type = TREE_TYPE (init_val);
3964 tree vectype = get_vectype_for_scalar_type (scalar_type); 4129 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
3965 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
3966 tree def_for_init; 4130 tree def_for_init;
3967 tree init_def; 4131 tree init_def;
3968 REAL_VALUE_TYPE real_init_val = dconst0; 4132 REAL_VALUE_TYPE real_init_val = dconst0;
3969 int int_init_val = 0; 4133 int int_init_val = 0;
3970 gimple_seq stmts = NULL; 4134 gimple_seq stmts = NULL;
3975 || SCALAR_FLOAT_TYPE_P (scalar_type)); 4139 || SCALAR_FLOAT_TYPE_P (scalar_type));
3976 4140
3977 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo) 4141 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
3978 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father); 4142 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
3979 4143
3980 vect_reduction_type reduction_type 4144 /* ADJUSTMENT_DEF is NULL when called from
3981 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo); 4145 vect_create_epilog_for_reduction to vectorize double reduction. */
4146 if (adjustment_def)
4147 *adjustment_def = NULL;
3982 4148
3983 switch (code) 4149 switch (code)
3984 { 4150 {
3985 case WIDEN_SUM_EXPR: 4151 case WIDEN_SUM_EXPR:
3986 case DOT_PROD_EXPR: 4152 case DOT_PROD_EXPR:
3990 case BIT_IOR_EXPR: 4156 case BIT_IOR_EXPR:
3991 case BIT_XOR_EXPR: 4157 case BIT_XOR_EXPR:
3992 case MULT_EXPR: 4158 case MULT_EXPR:
3993 case BIT_AND_EXPR: 4159 case BIT_AND_EXPR:
3994 { 4160 {
3995 /* ADJUSTMENT_DEF is NULL when called from
3996 vect_create_epilog_for_reduction to vectorize double reduction. */
3997 if (adjustment_def)
3998 *adjustment_def = init_val;
3999
4000 if (code == MULT_EXPR) 4161 if (code == MULT_EXPR)
4001 { 4162 {
4002 real_init_val = dconst1; 4163 real_init_val = dconst1;
4003 int_init_val = 1; 4164 int_init_val = 1;
4004 } 4165 }
4009 if (SCALAR_FLOAT_TYPE_P (scalar_type)) 4170 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4010 def_for_init = build_real (scalar_type, real_init_val); 4171 def_for_init = build_real (scalar_type, real_init_val);
4011 else 4172 else
4012 def_for_init = build_int_cst (scalar_type, int_init_val); 4173 def_for_init = build_int_cst (scalar_type, int_init_val);
4013 4174
4014 if (adjustment_def) 4175 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4015 /* Option1: the first element is '0' or '1' as well. */ 4176 {
4016 init_def = gimple_build_vector_from_val (&stmts, vectype, 4177 /* Option1: the first element is '0' or '1' as well. */
4017 def_for_init); 4178 if (!operand_equal_p (def_for_init, init_val, 0))
4179 *adjustment_def = init_val;
4180 init_def = gimple_build_vector_from_val (&stmts, vectype,
4181 def_for_init);
4182 }
4018 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) 4183 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4019 { 4184 {
4020 /* Option2 (variable length): the first element is INIT_VAL. */ 4185 /* Option2 (variable length): the first element is INIT_VAL. */
4021 init_def = gimple_build_vector_from_val (&stmts, vectype, 4186 init_def = gimple_build_vector_from_val (&stmts, vectype,
4022 def_for_init); 4187 def_for_init);
4036 4201
4037 case MIN_EXPR: 4202 case MIN_EXPR:
4038 case MAX_EXPR: 4203 case MAX_EXPR:
4039 case COND_EXPR: 4204 case COND_EXPR:
4040 { 4205 {
4041 if (adjustment_def)
4042 {
4043 *adjustment_def = NULL_TREE;
4044 if (reduction_type != COND_REDUCTION
4045 && reduction_type != EXTRACT_LAST_REDUCTION)
4046 {
4047 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4048 break;
4049 }
4050 }
4051 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); 4206 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4052 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val); 4207 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4053 } 4208 }
4054 break; 4209 break;
4055 4210
4073 unsigned int number_of_vectors, 4228 unsigned int number_of_vectors,
4074 bool reduc_chain, tree neutral_op) 4229 bool reduc_chain, tree neutral_op)
4075 { 4230 {
4076 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); 4231 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4077 stmt_vec_info stmt_vinfo = stmts[0]; 4232 stmt_vec_info stmt_vinfo = stmts[0];
4233 vec_info *vinfo = stmt_vinfo->vinfo;
4078 unsigned HOST_WIDE_INT nunits; 4234 unsigned HOST_WIDE_INT nunits;
4079 unsigned j, number_of_places_left_in_vector; 4235 unsigned j, number_of_places_left_in_vector;
4080 tree vector_type; 4236 tree vector_type;
4081 tree vop; 4237 unsigned int group_size = stmts.length ();
4082 int group_size = stmts.length (); 4238 unsigned int i;
4083 unsigned int vec_num, i; 4239 class loop *loop;
4084 unsigned number_of_copies = 1;
4085 vec<tree> voprnds;
4086 voprnds.create (number_of_vectors);
4087 struct loop *loop;
4088 auto_vec<tree, 16> permute_results;
4089 4240
4090 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); 4241 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4091 4242
4092 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def); 4243 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4093 4244
4114 {s5, s6, s7, s8}. */ 4265 {s5, s6, s7, s8}. */
4115 4266
4116 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits)) 4267 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4117 nunits = group_size; 4268 nunits = group_size;
4118 4269
4119 number_of_copies = nunits * number_of_vectors / group_size;
4120
4121 number_of_places_left_in_vector = nunits; 4270 number_of_places_left_in_vector = nunits;
4122 bool constant_p = true; 4271 bool constant_p = true;
4123 tree_vector_builder elts (vector_type, nunits, 1); 4272 tree_vector_builder elts (vector_type, nunits, 1);
4124 elts.quick_grow (nunits); 4273 elts.quick_grow (nunits);
4125 for (j = 0; j < number_of_copies; j++) 4274 gimple_seq ctor_seq = NULL;
4126 { 4275 for (j = 0; j < nunits * number_of_vectors; ++j)
4127 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--) 4276 {
4128 { 4277 tree op;
4129 tree op; 4278 i = j % group_size;
4130 /* Get the def before the loop. In a reduction chain we have only 4279 stmt_vinfo = stmts[i];
4131 one initial value. */ 4280
4132 if ((j != (number_of_copies - 1) 4281 /* Get the def before the loop. In a reduction chain we have only
4133 || (reduc_chain && i != 0)) 4282 one initial value. Else we have as many as there are PHIs in the group. */
4134 && neutral_op) 4283 if (reduc_chain)
4135 op = neutral_op; 4284 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4285 else if (((vec_oprnds->length () + 1) * nunits
4286 - number_of_places_left_in_vector >= group_size)
4287 && neutral_op)
4288 op = neutral_op;
4289 else
4290 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4291
4292 /* Create 'vect_ = {op0,op1,...,opn}'. */
4293 number_of_places_left_in_vector--;
4294 elts[nunits - number_of_places_left_in_vector - 1] = op;
4295 if (!CONSTANT_CLASS_P (op))
4296 constant_p = false;
4297
4298 if (number_of_places_left_in_vector == 0)
4299 {
4300 tree init;
4301 if (constant_p && !neutral_op
4302 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4303 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4304 /* Build the vector directly from ELTS. */
4305 init = gimple_build_vector (&ctor_seq, &elts);
4306 else if (neutral_op)
4307 {
4308 /* Build a vector of the neutral value and shift the
4309 other elements into place. */
4310 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4311 neutral_op);
4312 int k = nunits;
4313 while (k > 0 && elts[k - 1] == neutral_op)
4314 k -= 1;
4315 while (k > 0)
4316 {
4317 k -= 1;
4318 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4319 vector_type, init, elts[k]);
4320 }
4321 }
4136 else 4322 else
4137 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4138
4139 /* Create 'vect_ = {op0,op1,...,opn}'. */
4140 number_of_places_left_in_vector--;
4141 elts[number_of_places_left_in_vector] = op;
4142 if (!CONSTANT_CLASS_P (op))
4143 constant_p = false;
4144
4145 if (number_of_places_left_in_vector == 0)
4146 {
4147 gimple_seq ctor_seq = NULL;
4148 tree init;
4149 if (constant_p && !neutral_op
4150 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4151 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4152 /* Build the vector directly from ELTS. */
4153 init = gimple_build_vector (&ctor_seq, &elts);
4154 else if (neutral_op)
4155 {
4156 /* Build a vector of the neutral value and shift the
4157 other elements into place. */
4158 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4159 neutral_op);
4160 int k = nunits;
4161 while (k > 0 && elts[k - 1] == neutral_op)
4162 k -= 1;
4163 while (k > 0)
4164 {
4165 k -= 1;
4166 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4167 vector_type, init, elts[k]);
4168 }
4169 }
4170 else
4171 {
4172 /* First time round, duplicate ELTS to fill the
4173 required number of vectors, then cherry pick the
4174 appropriate result for each iteration. */
4175 if (vec_oprnds->is_empty ())
4176 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4177 number_of_vectors,
4178 permute_results);
4179 init = permute_results[number_of_vectors - j - 1];
4180 }
4181 if (ctor_seq != NULL)
4182 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4183 voprnds.quick_push (init);
4184
4185 number_of_places_left_in_vector = nunits;
4186 elts.new_vector (vector_type, nunits, 1);
4187 elts.quick_grow (nunits);
4188 constant_p = true;
4189 }
4190 }
4191 }
4192
4193 /* Since the vectors are created in the reverse order, we should invert
4194 them. */
4195 vec_num = voprnds.length ();
4196 for (j = vec_num; j != 0; j--)
4197 {
4198 vop = voprnds[j - 1];
4199 vec_oprnds->quick_push (vop);
4200 }
4201
4202 voprnds.release ();
4203
4204 /* In case that VF is greater than the unrolling factor needed for the SLP
4205 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4206 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4207 to replicate the vectors. */
4208 tree neutral_vec = NULL;
4209 while (number_of_vectors > vec_oprnds->length ())
4210 {
4211 if (neutral_op)
4212 {
4213 if (!neutral_vec)
4214 { 4323 {
4215 gimple_seq ctor_seq = NULL; 4324 /* First time round, duplicate ELTS to fill the
4216 neutral_vec = gimple_build_vector_from_val 4325 required number of vectors. */
4217 (&ctor_seq, vector_type, neutral_op); 4326 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4218 if (ctor_seq != NULL) 4327 number_of_vectors, *vec_oprnds);
4219 gsi_insert_seq_on_edge_immediate (pe, ctor_seq); 4328 break;
4220 } 4329 }
4221 vec_oprnds->quick_push (neutral_vec); 4330 vec_oprnds->quick_push (init);
4222 } 4331
4223 else 4332 number_of_places_left_in_vector = nunits;
4224 { 4333 elts.new_vector (vector_type, nunits, 1);
4225 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++) 4334 elts.quick_grow (nunits);
4226 vec_oprnds->quick_push (vop); 4335 constant_p = true;
4227 } 4336 }
4228 } 4337 }
4338 if (ctor_seq != NULL)
4339 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4229 } 4340 }
4230 4341
4342 /* For a statement STMT_INFO taking part in a reduction operation return
4343 the stmt_vec_info the meta information is stored on. */
4344
4345 stmt_vec_info
4346 info_for_reduction (stmt_vec_info stmt_info)
4347 {
4348 stmt_info = vect_orig_stmt (stmt_info);
4349 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4350 if (!is_a <gphi *> (stmt_info->stmt))
4351 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4352 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4353 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4354 {
4355 if (gimple_phi_num_args (phi) == 1)
4356 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4357 }
4358 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4359 {
4360 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4361 stmt_vec_info info
4362 = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4363 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4364 stmt_info = info;
4365 }
4366 return stmt_info;
4367 }
4231 4368
4232 /* Function vect_create_epilog_for_reduction 4369 /* Function vect_create_epilog_for_reduction
4233 4370
4234 Create code at the loop-epilog to finalize the result of a reduction 4371 Create code at the loop-epilog to finalize the result of a reduction
4235 computation. 4372 computation.
4236 4373
4237 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4238 reduction statements.
4239 STMT_INFO is the scalar reduction stmt that is being vectorized. 4374 STMT_INFO is the scalar reduction stmt that is being vectorized.
4240 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4241 number of elements that we can fit in a vectype (nunits). In this case
4242 we have to generate more than one vector stmt - i.e - we need to "unroll"
4243 the vector stmt by a factor VF/nunits. For more details see documentation
4244 in vectorizable_operation.
4245 REDUC_FN is the internal function for the epilog reduction.
4246 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4247 computation.
4248 REDUC_INDEX is the index of the operand in the right hand side of the
4249 statement that is defined by REDUCTION_PHI.
4250 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4251 SLP_NODE is an SLP node containing a group of reduction statements. The 4375 SLP_NODE is an SLP node containing a group of reduction statements. The
4252 first one in this group is STMT_INFO. 4376 first one in this group is STMT_INFO.
4253 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case 4377 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4254 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to 4378 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4255 be smaller than any value of the IV in the loop, for MIN_EXPR larger than 4379 (counting from 0)
4256 any value of the IV in the loop.
4257 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4258 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4259 null if this is not an SLP reduction
4260 4380
4261 This function: 4381 This function:
4262 1. Creates the reduction def-use cycles: sets the arguments for 4382 1. Completes the reduction def-use cycles.
4263 REDUCTION_PHIS:
4264 The loop-entry argument is the vectorized initial-value of the reduction.
4265 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4266 sums.
4267 2. "Reduces" each vector of partial results VECT_DEFS into a single result, 4383 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4268 by calling the function specified by REDUC_FN if available, or by 4384 by calling the function specified by REDUC_FN if available, or by
4269 other means (whole-vector shifts or a scalar loop). 4385 other means (whole-vector shifts or a scalar loop).
4270 The function also creates a new phi node at the loop exit to preserve 4386 The function also creates a new phi node at the loop exit to preserve
4271 loop-closed form, as illustrated below. 4387 loop-closed form, as illustrated below.
4272 4388
4273 The flow at the entry to this function: 4389 The flow at the entry to this function:
4274 4390
4275 loop: 4391 loop:
4276 vec_def = phi <null, null> # REDUCTION_PHI 4392 vec_def = phi <vec_init, null> # REDUCTION_PHI
4277 VECT_DEF = vector_stmt # vectorized form of STMT_INFO 4393 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4278 s_loop = scalar_stmt # (scalar) STMT_INFO 4394 s_loop = scalar_stmt # (scalar) STMT_INFO
4279 loop_exit: 4395 loop_exit:
4280 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 4396 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4281 use <s_out0> 4397 use <s_out0>
4296 use <s_out4> 4412 use <s_out4>
4297 use <s_out4> 4413 use <s_out4>
4298 */ 4414 */
4299 4415
4300 static void 4416 static void
4301 vect_create_epilog_for_reduction (vec<tree> vect_defs, 4417 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4302 stmt_vec_info stmt_info,
4303 gimple *reduc_def_stmt,
4304 int ncopies, internal_fn reduc_fn,
4305 vec<stmt_vec_info> reduction_phis,
4306 bool double_reduc,
4307 slp_tree slp_node, 4418 slp_tree slp_node,
4308 slp_instance slp_node_instance, 4419 slp_instance slp_node_instance)
4309 tree induc_val, enum tree_code induc_code,
4310 tree neutral_op)
4311 { 4420 {
4421 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4422 gcc_assert (reduc_info->is_reduc_info);
4423 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4424 /* For double reductions we need to get at the inner loop reduction
4425 stmt which has the meta info attached. Our stmt_info is that of the
4426 loop-closed PHI of the inner loop which we remember as
4427 def for the reduction PHI generation. */
4428 bool double_reduc = false;
4429 stmt_vec_info rdef_info = stmt_info;
4430 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4431 {
4432 gcc_assert (!slp_node);
4433 double_reduc = true;
4434 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4435 (stmt_info->stmt, 0));
4436 stmt_info = vect_stmt_to_vectorize (stmt_info);
4437 }
4438 gphi *reduc_def_stmt
4439 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4440 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4441 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4312 stmt_vec_info prev_phi_info; 4442 stmt_vec_info prev_phi_info;
4313 tree vectype; 4443 tree vectype;
4314 machine_mode mode; 4444 machine_mode mode;
4315 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4445 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4316 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4317 basic_block exit_bb; 4446 basic_block exit_bb;
4318 tree scalar_dest; 4447 tree scalar_dest;
4319 tree scalar_type; 4448 tree scalar_type;
4320 gimple *new_phi = NULL, *phi; 4449 gimple *new_phi = NULL, *phi;
4321 stmt_vec_info phi_info; 4450 stmt_vec_info phi_info;
4322 gimple_stmt_iterator exit_gsi; 4451 gimple_stmt_iterator exit_gsi;
4323 tree vec_dest; 4452 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4324 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4325 gimple *epilog_stmt = NULL; 4453 gimple *epilog_stmt = NULL;
4326 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4327 gimple *exit_phi; 4454 gimple *exit_phi;
4328 tree bitsize; 4455 tree bitsize;
4329 tree adjustment_def = NULL; 4456 tree def;
4330 tree vec_initial_def = NULL;
4331 tree expr, def, initial_def = NULL;
4332 tree orig_name, scalar_result; 4457 tree orig_name, scalar_result;
4333 imm_use_iterator imm_iter, phi_imm_iter; 4458 imm_use_iterator imm_iter, phi_imm_iter;
4334 use_operand_p use_p, phi_use_p; 4459 use_operand_p use_p, phi_use_p;
4335 gimple *use_stmt; 4460 gimple *use_stmt;
4336 stmt_vec_info reduction_phi_info = NULL;
4337 bool nested_in_vect_loop = false; 4461 bool nested_in_vect_loop = false;
4338 auto_vec<gimple *> new_phis; 4462 auto_vec<gimple *> new_phis;
4339 auto_vec<stmt_vec_info> inner_phis;
4340 int j, i; 4463 int j, i;
4341 auto_vec<tree> scalar_results; 4464 auto_vec<tree> scalar_results;
4342 unsigned int group_size = 1, k, ratio; 4465 unsigned int group_size = 1, k;
4343 auto_vec<tree> vec_initial_defs;
4344 auto_vec<gimple *> phis; 4466 auto_vec<gimple *> phis;
4345 bool slp_reduc = false; 4467 bool slp_reduc = false;
4346 bool direct_slp_reduc; 4468 bool direct_slp_reduc;
4347 tree new_phi_result; 4469 tree new_phi_result;
4348 stmt_vec_info inner_phi = NULL;
4349 tree induction_index = NULL_TREE; 4470 tree induction_index = NULL_TREE;
4350 4471
4351 if (slp_node) 4472 if (slp_node)
4352 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 4473 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4353 4474
4356 outer_loop = loop; 4477 outer_loop = loop;
4357 loop = loop->inner; 4478 loop = loop->inner;
4358 nested_in_vect_loop = true; 4479 nested_in_vect_loop = true;
4359 gcc_assert (!slp_node); 4480 gcc_assert (!slp_node);
4360 } 4481 }
4361 4482 gcc_assert (!nested_in_vect_loop || double_reduc);
4362 vectype = STMT_VINFO_VECTYPE (stmt_info); 4483
4484 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4363 gcc_assert (vectype); 4485 gcc_assert (vectype);
4364 mode = TYPE_MODE (vectype); 4486 mode = TYPE_MODE (vectype);
4365 4487
4366 /* 1. Create the reduction def-use cycle: 4488 tree initial_def = NULL;
4367 Set the arguments of REDUCTION_PHIS, i.e., transform 4489 tree induc_val = NULL_TREE;
4368 4490 tree adjustment_def = NULL;
4369 loop:
4370 vec_def = phi <null, null> # REDUCTION_PHI
4371 VECT_DEF = vector_stmt # vectorized form of STMT
4372 ...
4373
4374 into:
4375
4376 loop:
4377 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4378 VECT_DEF = vector_stmt # vectorized form of STMT
4379 ...
4380
4381 (in case of SLP, do it for all the phis). */
4382
4383 /* Get the loop-entry arguments. */
4384 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4385 if (slp_node) 4491 if (slp_node)
4386 { 4492 ;
4387 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4388 vec_initial_defs.reserve (vec_num);
4389 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4390 &vec_initial_defs, vec_num,
4391 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4392 neutral_op);
4393 }
4394 else 4493 else
4395 { 4494 {
4396 /* Get at the scalar def before the loop, that defines the initial value 4495 /* Get at the scalar def before the loop, that defines the initial value
4397 of the reduction variable. */ 4496 of the reduction variable. */
4398 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, 4497 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4399 loop_preheader_edge (loop)); 4498 loop_preheader_edge (loop));
4400 /* Optimize: if initial_def is for REDUC_MAX smaller than the base 4499 /* Optimize: for induction condition reduction, if we can't use zero
4401 and we can't use zero for induc_val, use initial_def. Similarly 4500 for induc_val, use initial_def. */
4402 for REDUC_MIN and initial_def larger than the base. */ 4501 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4403 if (TREE_CODE (initial_def) == INTEGER_CST 4502 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4404 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 4503 else if (double_reduc)
4405 == INTEGER_INDUC_COND_REDUCTION) 4504 ;
4406 && !integer_zerop (induc_val)
4407 && ((induc_code == MAX_EXPR
4408 && tree_int_cst_lt (initial_def, induc_val))
4409 || (induc_code == MIN_EXPR
4410 && tree_int_cst_lt (induc_val, initial_def))))
4411 induc_val = initial_def;
4412
4413 if (double_reduc)
4414 /* In case of double reduction we only create a vector variable
4415 to be put in the reduction phi node. The actual statement
4416 creation is done later in this function. */
4417 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4418 else if (nested_in_vect_loop) 4505 else if (nested_in_vect_loop)
4419 { 4506 ;
4420 /* Do not use an adjustment def as that case is not supported
4421 correctly if ncopies is not one. */
4422 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4423 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4424 stmt_info);
4425 }
4426 else 4507 else
4427 vec_initial_def 4508 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4428 = get_initial_def_for_reduction (stmt_info, initial_def, 4509 }
4429 &adjustment_def); 4510
4430 vec_initial_defs.create (1); 4511 unsigned vec_num;
4431 vec_initial_defs.quick_push (vec_initial_def); 4512 int ncopies;
4432 } 4513 if (slp_node)
4433 4514 {
4434 /* Set phi nodes arguments. */ 4515 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4435 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info) 4516 ncopies = 1;
4436 { 4517 }
4437 tree vec_init_def = vec_initial_defs[i]; 4518 else
4438 tree def = vect_defs[i]; 4519 {
4439 for (j = 0; j < ncopies; j++) 4520 vec_num = 1;
4440 { 4521 ncopies = 0;
4441 if (j != 0) 4522 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4442 { 4523 do
4443 phi_info = STMT_VINFO_RELATED_STMT (phi_info); 4524 {
4444 if (nested_in_vect_loop) 4525 ncopies++;
4445 vec_init_def 4526 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4446 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def); 4527 }
4447 } 4528 while (phi_info);
4448
4449 /* Set the loop-entry arg of the reduction-phi. */
4450
4451 gphi *phi = as_a <gphi *> (phi_info->stmt);
4452 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4453 == INTEGER_INDUC_COND_REDUCTION)
4454 {
4455 /* Initialise the reduction phi to zero. This prevents initial
4456 values of non-zero interferring with the reduction op. */
4457 gcc_assert (ncopies == 1);
4458 gcc_assert (i == 0);
4459
4460 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4461 tree induc_val_vec
4462 = build_vector_from_val (vec_init_def_type, induc_val);
4463
4464 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4465 UNKNOWN_LOCATION);
4466 }
4467 else
4468 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4469 UNKNOWN_LOCATION);
4470
4471 /* Set the loop-latch arg for the reduction-phi. */
4472 if (j > 0)
4473 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4474
4475 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4476
4477 if (dump_enabled_p ())
4478 dump_printf_loc (MSG_NOTE, vect_location,
4479 "transform reduction: created def-use cycle: %G%G",
4480 phi, SSA_NAME_DEF_STMT (def));
4481 }
4482 } 4529 }
4483 4530
4484 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) 4531 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4485 which is updated with the current index of the loop for every match of 4532 which is updated with the current index of the loop for every match of
4486 the original loop's cond_expr (VEC_STMT). This results in a vector 4533 the original loop's cond_expr (VEC_STMT). This results in a vector
4487 containing the last time the condition passed for that vector lane. 4534 containing the last time the condition passed for that vector lane.
4488 The first match will be a 1 to allow 0 to be used for non-matching 4535 The first match will be a 1 to allow 0 to be used for non-matching
4489 indexes. If there are no matches at all then the vector will be all 4536 indexes. If there are no matches at all then the vector will be all
4490 zeroes. */ 4537 zeroes.
4491 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) 4538
4492 { 4539 PR92772: This algorithm is broken for architectures that support
4540 masked vectors, but do not provide fold_extract_last. */
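(Worked example with illustrative lanes: for VF = 4 the induction vector starts at {1, 2, 3, 4} and steps by 4 each vector iteration. If the condition holds only in lane 1 during the first vector iteration and only in lane 3 during the second, the accumulated vector ends as {0, 2, 0, 8}: every lane holds the 1-based scalar iteration number of its last match, and lanes that never matched stay 0, which is why the count starts at 1 rather than 0.)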
4541 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4542 {
4543 auto_vec<std::pair<tree, bool>, 2> ccompares;
4544 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4545 cond_info = vect_stmt_to_vectorize (cond_info);
4546 while (cond_info != reduc_info)
4547 {
4548 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4549 {
4550 gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
4551 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4552 ccompares.safe_push
4553 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4554 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4555 }
4556 cond_info
4557 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4558 1 + STMT_VINFO_REDUC_IDX
4559 (cond_info)));
4560 cond_info = vect_stmt_to_vectorize (cond_info);
4561 }
4562 gcc_assert (ccompares.length () != 0);
4563
4493 tree indx_before_incr, indx_after_incr; 4564 tree indx_before_incr, indx_after_incr;
4494 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); 4565 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4495
4496 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4497 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4498
4499 int scalar_precision 4566 int scalar_precision
4500 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype))); 4567 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4501 tree cr_index_scalar_type = make_unsigned_type (scalar_precision); 4568 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4502 tree cr_index_vector_type = build_vector_type 4569 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4503 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype)); 4570 (TYPE_MODE (vectype), cr_index_scalar_type,
4571 TYPE_VECTOR_SUBPARTS (vectype));
4504 4572
4505 /* First we create a simple vector induction variable which starts 4573 /* First we create a simple vector induction variable which starts
4506 with the values {1,2,3,...} (SERIES_VECT) and increments by the 4574 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4507 vector size (STEP). */ 4575 vector size (STEP). */
4508 4576
4532 new_phi = create_phi_node (new_phi_tree, loop->header); 4600 new_phi = create_phi_node (new_phi_tree, loop->header);
4533 loop_vinfo->add_stmt (new_phi); 4601 loop_vinfo->add_stmt (new_phi);
4534 add_phi_arg (as_a <gphi *> (new_phi), vec_zero, 4602 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4535 loop_preheader_edge (loop), UNKNOWN_LOCATION); 4603 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4536 4604
4537 /* Now take the condition from the loop's original cond_expr 4605 /* Now take the condition from the loop's original cond_exprs
4538 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for 4606 and produce new cond_exprs (INDEX_COND_EXPR) which for
4539 every match uses values from the induction variable 4607 every match uses values from the induction variable
4540 (INDEX_BEFORE_INCR) otherwise uses values from the phi node 4608 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4541 (NEW_PHI_TREE). 4609 (NEW_PHI_TREE).
4542 Finally, we update the phi (NEW_PHI_TREE) to take the value of 4610 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4543 the new cond_expr (INDEX_COND_EXPR). */ 4611 the new cond_expr (INDEX_COND_EXPR). */
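Per lane, the chain of VEC_COND_EXPRs built below behaves roughly like this (editor's sketch; nlanes, cond, iv and prev are made-up names, not variables from this function):

    void
    update_index_vector (int nlanes, const int *cond,
                         const unsigned *iv, unsigned *prev)
    {
      for (int lane = 0; lane < nlanes; lane++)
        if (cond[lane])
          prev[lane] = iv[lane];   /* record the current loop index for a match */
    }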
4544 4612 gimple_seq stmts = NULL;
4545 /* Duplicate the condition from vec_stmt. */ 4613 for (int i = ccompares.length () - 1; i != -1; --i)
4546 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt)); 4614 {
4547 4615 tree ccompare = ccompares[i].first;
4548 /* Create a conditional, where the condition is taken from vec_stmt 4616 if (ccompares[i].second)
4549 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and 4617 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4550 else is the phi (NEW_PHI_TREE). */ 4618 cr_index_vector_type,
4551 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, 4619 ccompare,
4552 ccompare, indx_before_incr, 4620 indx_before_incr, new_phi_tree);
4553 new_phi_tree); 4621 else
4554 induction_index = make_ssa_name (cr_index_vector_type); 4622 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4555 gimple *index_condition = gimple_build_assign (induction_index, 4623 cr_index_vector_type,
4556 index_cond_expr); 4624 ccompare,
4557 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT); 4625 new_phi_tree, indx_before_incr);
4558 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition); 4626 }
4627 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4628 stmt_vec_info index_vec_info
4629 = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
4559 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type; 4630 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4560 4631
4561 /* Update the phi with the vec cond. */ 4632 /* Update the phi with the vec cond. */
4633 induction_index = new_phi_tree;
4562 add_phi_arg (as_a <gphi *> (new_phi), induction_index, 4634 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4563 loop_latch_edge (loop), UNKNOWN_LOCATION); 4635 loop_latch_edge (loop), UNKNOWN_LOCATION);
4564 } 4636 }
4565 4637
4566 /* 2. Create epilog code. 4638 /* 2. Create epilog code.
4591 4663
4592 4664
4593 /* 2.1 Create new loop-exit-phis to preserve loop-closed form: 4665 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4594 v_out1 = phi <VECT_DEF> 4666 v_out1 = phi <VECT_DEF>
4595 Store them in NEW_PHIS. */ 4667 Store them in NEW_PHIS. */
4596 4668 if (double_reduc)
4669 loop = outer_loop;
4597 exit_bb = single_exit (loop)->dest; 4670 exit_bb = single_exit (loop)->dest;
4598 prev_phi_info = NULL; 4671 prev_phi_info = NULL;
4599 new_phis.create (vect_defs.length ()); 4672 new_phis.create (slp_node ? vec_num : ncopies);
4600 FOR_EACH_VEC_ELT (vect_defs, i, def) 4673 for (unsigned i = 0; i < vec_num; i++)
4601 { 4674 {
4675 if (slp_node)
4676 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4677 else
4678 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4602 for (j = 0; j < ncopies; j++) 4679 for (j = 0; j < ncopies; j++)
4603 { 4680 {
4604 tree new_def = copy_ssa_name (def); 4681 tree new_def = copy_ssa_name (def);
4605 phi = create_phi_node (new_def, exit_bb); 4682 phi = create_phi_node (new_def, exit_bb);
4606 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi); 4683 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4615 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); 4692 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4616 prev_phi_info = phi_info; 4693 prev_phi_info = phi_info;
4617 } 4694 }
4618 } 4695 }
4619 4696
4620 /* The epilogue is created for the outer-loop, i.e., for the loop being
4621 vectorized. Create exit phis for the outer loop. */
4622 if (double_reduc)
4623 {
4624 loop = outer_loop;
4625 exit_bb = single_exit (loop)->dest;
4626 inner_phis.create (vect_defs.length ());
4627 FOR_EACH_VEC_ELT (new_phis, i, phi)
4628 {
4629 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4630 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4631 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4632 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4633 PHI_RESULT (phi));
4634 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4635 inner_phis.quick_push (phi_info);
4636 new_phis[i] = outer_phi;
4637 while (STMT_VINFO_RELATED_STMT (phi_info))
4638 {
4639 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4640 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4641 outer_phi = create_phi_node (new_result, exit_bb);
4642 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4643 PHI_RESULT (phi_info->stmt));
4644 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4645 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4646 prev_phi_info = outer_phi_info;
4647 }
4648 }
4649 }
4650
4651 exit_gsi = gsi_after_labels (exit_bb); 4697 exit_gsi = gsi_after_labels (exit_bb);
4652 4698
4653 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 4699 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4654 (i.e. when reduc_fn is not available) and in the final adjustment 4700 (i.e. when reduc_fn is not available) and in the final adjustment
4655 code (if needed). Also get the original scalar reduction variable as 4701 code (if needed). Also get the original scalar reduction variable as
4664 { 4710 {
4665 /* Reduction pattern */ 4711 /* Reduction pattern */
4666 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 4712 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4667 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); 4713 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4668 } 4714 }
4669
4670 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4671 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4672 partial results are added and not subtracted. */
4673 if (code == MINUS_EXPR)
4674 code = PLUS_EXPR;
4675 4715
4676 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt); 4716 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4677 scalar_type = TREE_TYPE (scalar_dest); 4717 scalar_type = TREE_TYPE (scalar_dest);
4678 scalar_results.create (group_size); 4718 scalar_results.create (group_size);
4679 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); 4719 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4680 bitsize = TYPE_SIZE (scalar_type); 4720 bitsize = TYPE_SIZE (scalar_type);
4681 4721
4682 /* In case this is a reduction in an inner-loop while vectorizing an outer
4683 loop - we don't need to extract a single scalar result at the end of the
4684 inner-loop (unless it is double reduction, i.e., the use of reduction is
4685 outside the outer-loop). The final vector of partial results will be used
4686 in the vectorized outer-loop, or reduced to a scalar result at the end of
4687 the outer-loop. */
4688 if (nested_in_vect_loop && !double_reduc)
4689 goto vect_finalize_reduction;
4690
4691 /* SLP reduction without reduction chain, e.g., 4722 /* SLP reduction without reduction chain, e.g.,
4692 # a1 = phi <a2, a0> 4723 # a1 = phi <a2, a0>
4693 # b1 = phi <b2, b0> 4724 # b1 = phi <b2, b0>
4694 a2 = operation (a1) 4725 a2 = operation (a1)
4695 b2 = operation (b1) */ 4726 b2 = operation (b1) */
4708 4739
4709 we may end up with more than one vector result. Here we reduce them to 4740 we may end up with more than one vector result. Here we reduce them to
4710 one vector. */ 4741 one vector. */
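Roughly, this step folds several partial vector accumulators into one vector, lane by lane, before any horizontal reduction happens (editor's sketch for the PLUS_EXPR case; names are made up):

    void
    combine_partial_vectors (int nlanes, int ncopies,
                             const int *partials /* ncopies x nlanes */,
                             int *out)
    {
      for (int lane = 0; lane < nlanes; lane++)
        {
          int acc = partials[lane];                 /* first partial vector */
          for (int k = 1; k < ncopies; k++)
            acc += partials[k * nlanes + lane];     /* fold in the others */
          out[lane] = acc;
        }
    }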
4711 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc) 4742 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4712 { 4743 {
4744 gimple_seq stmts = NULL;
4713 tree first_vect = PHI_RESULT (new_phis[0]); 4745 tree first_vect = PHI_RESULT (new_phis[0]);
4714 gassign *new_vec_stmt = NULL; 4746 first_vect = gimple_convert (&stmts, vectype, first_vect);
4715 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4716 for (k = 1; k < new_phis.length (); k++) 4747 for (k = 1; k < new_phis.length (); k++)
4717 { 4748 {
4718 gimple *next_phi = new_phis[k]; 4749 gimple *next_phi = new_phis[k];
4719 tree second_vect = PHI_RESULT (next_phi); 4750 tree second_vect = PHI_RESULT (next_phi);
4720 tree tem = make_ssa_name (vec_dest, new_vec_stmt); 4751 second_vect = gimple_convert (&stmts, vectype, second_vect);
4721 new_vec_stmt = gimple_build_assign (tem, code, 4752 first_vect = gimple_build (&stmts, code, vectype,
4722 first_vect, second_vect); 4753 first_vect, second_vect);
4723 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4724 first_vect = tem;
4725 } 4754 }
4755 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4726 4756
4727 new_phi_result = first_vect; 4757 new_phi_result = first_vect;
4728 if (new_vec_stmt) 4758 new_phis.truncate (0);
4729 { 4759 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4730 new_phis.truncate (0);
4731 new_phis.safe_push (new_vec_stmt);
4732 }
4733 } 4760 }
4734 /* Likewise if we couldn't use a single defuse cycle. */ 4761 /* Likewise if we couldn't use a single defuse cycle. */
4735 else if (ncopies > 1) 4762 else if (ncopies > 1)
4736 { 4763 {
4737 gcc_assert (new_phis.length () == 1); 4764 gcc_assert (new_phis.length () == 1);
4765 gimple_seq stmts = NULL;
4738 tree first_vect = PHI_RESULT (new_phis[0]); 4766 tree first_vect = PHI_RESULT (new_phis[0]);
4739 gassign *new_vec_stmt = NULL; 4767 first_vect = gimple_convert (&stmts, vectype, first_vect);
4740 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4741 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]); 4768 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4742 for (int k = 1; k < ncopies; ++k) 4769 for (int k = 1; k < ncopies; ++k)
4743 { 4770 {
4744 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info); 4771 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4745 tree second_vect = PHI_RESULT (next_phi_info->stmt); 4772 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4746 tree tem = make_ssa_name (vec_dest, new_vec_stmt); 4773 second_vect = gimple_convert (&stmts, vectype, second_vect);
4747 new_vec_stmt = gimple_build_assign (tem, code, 4774 first_vect = gimple_build (&stmts, code, vectype,
4748 first_vect, second_vect); 4775 first_vect, second_vect);
4749 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); 4776 }
4750 first_vect = tem; 4777 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4751 }
4752 new_phi_result = first_vect; 4778 new_phi_result = first_vect;
4753 new_phis.truncate (0); 4779 new_phis.truncate (0);
4754 new_phis.safe_push (new_vec_stmt); 4780 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4755 } 4781 }
4756 else 4782 else
4757 new_phi_result = PHI_RESULT (new_phis[0]); 4783 new_phi_result = PHI_RESULT (new_phis[0]);
4758 4784
4759 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION 4785 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4760 && reduc_fn != IFN_LAST) 4786 && reduc_fn != IFN_LAST)
4761 { 4787 {
4762 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing 4788 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4763 various data values where the condition matched and another vector 4789 various data values where the condition matched and another vector
4764 (INDUCTION_INDEX) containing all the indexes of those matches. We 4790 (INDUCTION_INDEX) containing all the indexes of those matches. We
4769 4795
4770 /* Get various versions of the type of the vector of indexes. */ 4796 /* Get various versions of the type of the vector of indexes. */
4771 tree index_vec_type = TREE_TYPE (induction_index); 4797 tree index_vec_type = TREE_TYPE (induction_index);
4772 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); 4798 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4773 tree index_scalar_type = TREE_TYPE (index_vec_type); 4799 tree index_scalar_type = TREE_TYPE (index_vec_type);
4774 tree index_vec_cmp_type = build_same_sized_truth_vector_type 4800 tree index_vec_cmp_type = truth_type_for (index_vec_type);
4775 (index_vec_type);
4776 4801
4777 /* Get an unsigned integer version of the type of the data vector. */ 4802 /* Get an unsigned integer version of the type of the data vector. */
4778 int scalar_precision 4803 int scalar_precision
4779 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); 4804 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4780 tree scalar_type_unsigned = make_unsigned_type (scalar_precision); 4805 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4786 can create using a MAX reduction and then expanding. 4811 can create using a MAX reduction and then expanding.
4787 In the case where the loop never made any matches, the max index will 4812 In the case where the loop never made any matches, the max index will
4788 be zero. */ 4813 be zero. */
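In scalar terms the epilogue described above does roughly the following (editor's sketch; a max index of zero means no iteration ever matched, in which case the original initial value of the scalar reduction is what should survive):

    int
    cond_reduc_extract (int nlanes, const unsigned *index_vec,
                        const int *data_vec, int no_match_value)
    {
      unsigned max_index = 0;
      for (int lane = 0; lane < nlanes; lane++)
        if (index_vec[lane] > max_index)
          max_index = index_vec[lane];
      if (max_index == 0)
        return no_match_value;          /* the loop never matched */
      for (int lane = 0; lane < nlanes; lane++)
        if (index_vec[lane] == max_index)
          return data_vec[lane];        /* data from the last matching lane */
      return no_match_value;
    }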
4789 4814
4790 /* Vector of {0, 0, 0,...}. */ 4815 /* Vector of {0, 0, 0,...}. */
4791 tree zero_vec = make_ssa_name (vectype); 4816 tree zero_vec = build_zero_cst (vectype);
4792 tree zero_vec_rhs = build_zero_cst (vectype); 4817
4793 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs); 4818 gimple_seq stmts = NULL;
4794 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT); 4819 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4820 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4795 4821
4796 /* Find maximum value from the vector of found indexes. */ 4822 /* Find maximum value from the vector of found indexes. */
4797 tree max_index = make_ssa_name (index_scalar_type); 4823 tree max_index = make_ssa_name (index_scalar_type);
4798 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, 4824 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4799 1, induction_index); 4825 1, induction_index);
4857 gimple_call_set_lhs (data_reduc_stmt, data_reduc); 4883 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4858 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT); 4884 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4859 4885
4860 /* Convert the reduced value back to the result type and set as the 4886 /* Convert the reduced value back to the result type and set as the
4861 result. */ 4887 result. */
4862 gimple_seq stmts = NULL; 4888 stmts = NULL;
4863 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, 4889 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4864 data_reduc); 4890 data_reduc);
4865 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 4891 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4866 scalar_results.safe_push (new_temp); 4892 scalar_results.safe_push (new_temp);
4867 } 4893 }
4868 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION 4894 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4869 && reduc_fn == IFN_LAST) 4895 && reduc_fn == IFN_LAST)
4870 { 4896 {
4871 /* Condition reduction without supported IFN_REDUC_MAX. Generate 4897 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4872 idx = 0; 4898 idx = 0;
4873 idx_val = induction_index[0]; 4899 idx_val = induction_index[0];
4906 bitsize_int (off))); 4932 bitsize_int (off)));
4907 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 4933 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4908 if (off != 0) 4934 if (off != 0)
4909 { 4935 {
4910 tree new_idx_val = idx_val; 4936 tree new_idx_val = idx_val;
4911 tree new_val = val;
4912 if (off != v_size - el_size) 4937 if (off != v_size - el_size)
4913 { 4938 {
4914 new_idx_val = make_ssa_name (idx_eltype); 4939 new_idx_val = make_ssa_name (idx_eltype);
4915 epilog_stmt = gimple_build_assign (new_idx_val, 4940 epilog_stmt = gimple_build_assign (new_idx_val,
4916 MAX_EXPR, idx_val, 4941 MAX_EXPR, idx_val,
4917 old_idx_val); 4942 old_idx_val);
4918 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 4943 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4919 } 4944 }
4920 new_val = make_ssa_name (data_eltype); 4945 tree new_val = make_ssa_name (data_eltype);
4921 epilog_stmt = gimple_build_assign (new_val, 4946 epilog_stmt = gimple_build_assign (new_val,
4922 COND_EXPR, 4947 COND_EXPR,
4923 build2 (GT_EXPR, 4948 build2 (GT_EXPR,
4924 boolean_type_node, 4949 boolean_type_node,
4925 idx_val, 4950 idx_val,
4951 4976
4952 if (dump_enabled_p ()) 4977 if (dump_enabled_p ())
4953 dump_printf_loc (MSG_NOTE, vect_location, 4978 dump_printf_loc (MSG_NOTE, vect_location,
4954 "Reduce using direct vector reduction.\n"); 4979 "Reduce using direct vector reduction.\n");
4955 4980
4981 gimple_seq stmts = NULL;
4982 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4956 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); 4983 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4957 if (!useless_type_conversion_p (scalar_type, vec_elem_type)) 4984 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
4958 { 4985 vec_elem_type, new_phi_result);
4959 tree tmp_dest 4986 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
4960 = vect_create_destination_var (scalar_dest, vec_elem_type); 4987 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4961 epilog_stmt = gimple_build_call_internal (reduc_fn, 1, 4988
4962 new_phi_result); 4989 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4963 gimple_set_lhs (epilog_stmt, tmp_dest); 4990 && induc_val)
4964 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4965 gimple_set_lhs (epilog_stmt, new_temp);
4966 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4967
4968 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4969 new_temp);
4970 }
4971 else
4972 {
4973 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4974 new_phi_result);
4975 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4976 }
4977
4978 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4979 gimple_set_lhs (epilog_stmt, new_temp);
4980 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4981
4982 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4983 == INTEGER_INDUC_COND_REDUCTION)
4984 && !operand_equal_p (initial_def, induc_val, 0))
4985 { 4991 {
4986 /* Earlier we set the initial value to be a vector of induc_val 4992
4987 values. Check the result and if it is induc_val then replace 4993 values. Check the result and if it is induc_val then replace
4988 with the original initial value, unless induc_val is 4994 with the original initial value, unless induc_val is
4989 the same as initial_def already. */ 4995 the same as initial_def already. */
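The fix-up this comment describes is, in scalar terms (editor's sketch, not the generated COND_EXPR itself):

    int
    fixup_induc_cond_result (int result, int induc_val, int initial_def)
    {
      /* If the reduction still holds the sentinel induc_val, no iteration
         matched, so hand back the original initial value.  */
      return result == induc_val ? initial_def : result;
    }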
5017 /* Build a vector {0, 1, 2, ...}, with the same number of elements 5023 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5018 and the same element size as VECTYPE. */ 5024 and the same element size as VECTYPE. */
5019 tree index = build_index_vector (vectype, 0, 1); 5025 tree index = build_index_vector (vectype, 0, 1);
5020 tree index_type = TREE_TYPE (index); 5026 tree index_type = TREE_TYPE (index);
5021 tree index_elt_type = TREE_TYPE (index_type); 5027 tree index_elt_type = TREE_TYPE (index_type);
5022 tree mask_type = build_same_sized_truth_vector_type (index_type); 5028 tree mask_type = truth_type_for (index_type);
5023 5029
5024 /* Create a vector that, for each element, identifies which of 5030 /* Create a vector that, for each element, identifies which of
5025 the REDUC_GROUP_SIZE results should use it. */ 5031 the REDUC_GROUP_SIZE results should use it. */
5026 tree index_mask = build_int_cst (index_elt_type, group_size - 1); 5032 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5027 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index, 5033 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5029 5035
5030 /* Get a neutral vector value. This is simply a splat of the neutral 5036 /* Get a neutral vector value. This is simply a splat of the neutral
5031 scalar value if we have one, otherwise the initial scalar value 5037 scalar value if we have one, otherwise the initial scalar value
5032 is itself a neutral value. */ 5038 is itself a neutral value. */
5033 tree vector_identity = NULL_TREE; 5039 tree vector_identity = NULL_TREE;
5040 tree neutral_op = NULL_TREE;
5041 if (slp_node)
5042 {
5043 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5044 neutral_op
5045 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5046 vectype, code, first != NULL);
5047 }
5034 if (neutral_op) 5048 if (neutral_op)
5035 vector_identity = gimple_build_vector_from_val (&seq, vectype, 5049 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5036 neutral_op); 5050 neutral_op);
5037 for (unsigned int i = 0; i < group_size; ++i) 5051 for (unsigned int i = 0; i < group_size; ++i)
5038 { 5052 {
5042 if (!neutral_op) 5056 if (!neutral_op)
5043 { 5057 {
5044 tree scalar_value 5058 tree scalar_value
5045 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt, 5059 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5046 loop_preheader_edge (loop)); 5060 loop_preheader_edge (loop));
5061 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5062 scalar_value);
5047 vector_identity = gimple_build_vector_from_val (&seq, vectype, 5063 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5048 scalar_value); 5064 scalar_value);
5049 } 5065 }
5050 5066
5051 /* Calculate the equivalent of: 5067 /* Calculate the equivalent of:
5078 else 5094 else
5079 { 5095 {
5080 bool reduce_with_shift; 5096 bool reduce_with_shift;
5081 tree vec_temp; 5097 tree vec_temp;
5082 5098
5083 /* COND reductions all do the final reduction with MAX_EXPR 5099 gcc_assert (slp_reduc || new_phis.length () == 1);
5084 or MIN_EXPR. */
5085 if (code == COND_EXPR)
5086 {
5087 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5088 == INTEGER_INDUC_COND_REDUCTION)
5089 code = induc_code;
5090 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5091 == CONST_COND_REDUCTION)
5092 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5093 else
5094 code = MAX_EXPR;
5095 }
5096 5100
5097 /* See if the target wants to do the final (shift) reduction 5101 /* See if the target wants to do the final (shift) reduction
5098 in a vector mode of smaller size and first reduce upper/lower 5102 in a vector mode of smaller size and first reduce upper/lower
5099 halves against each other. */ 5103 halves against each other. */
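The idea of the split step (editor's sketch, PLUS_EXPR case): halve the number of lanes by combining the upper half with the lower half, then keep reducing in the narrower mode.

    void
    reduce_to_half_width (int nlanes /* power of two */, const int *in, int *out)
    {
      for (int lane = 0; lane < nlanes / 2; lane++)
        out[lane] = in[lane] + in[lane + nlanes / 2];
    }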
5100 enum machine_mode mode1 = mode; 5104 enum machine_mode mode1 = mode;
5101 tree vectype1 = vectype; 5105 tree stype = TREE_TYPE (vectype);
5102 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype)); 5106 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5103 unsigned sz1 = sz; 5107 unsigned nunits1 = nunits;
5108 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5109 && new_phis.length () == 1)
5110 {
5111 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5112 /* For SLP reductions we have to make sure lanes match up, but
5113 since we're doing individual element final reduction reducing
5114 vector width here is even more important.
5115 ??? We can also separate lanes with permutes, for the common
5116 case of power-of-two group-size odd/even extracts would work. */
5117 if (slp_reduc && nunits != nunits1)
5118 {
5119 nunits1 = least_common_multiple (nunits1, group_size);
5120 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5121 }
5122 }
5104 if (!slp_reduc 5123 if (!slp_reduc
5105 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) 5124 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5106 sz1 = GET_MODE_SIZE (mode1).to_constant (); 5125 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5107 5126
5108 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1); 5127 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5128 stype, nunits1);
5109 reduce_with_shift = have_whole_vector_shift (mode1); 5129 reduce_with_shift = have_whole_vector_shift (mode1);
5110 if (!VECTOR_MODE_P (mode1)) 5130 if (!VECTOR_MODE_P (mode1))
5111 reduce_with_shift = false; 5131 reduce_with_shift = false;
5112 else 5132 else
5113 { 5133 {
5117 } 5137 }
5118 5138
5119 /* First reduce the vector to the desired vector size we should 5139 /* First reduce the vector to the desired vector size we should
5120 do shift reduction on by combining upper and lower halves. */ 5140 do shift reduction on by combining upper and lower halves. */
5121 new_temp = new_phi_result; 5141 new_temp = new_phi_result;
5122 while (sz > sz1) 5142 while (nunits > nunits1)
5123 { 5143 {
5124 gcc_assert (!slp_reduc); 5144 nunits /= 2;
5125 sz /= 2; 5145 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5126 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz); 5146 stype, nunits);
5147 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5127 5148
5128 /* The target has to make sure we support lowpart/highpart 5149 /* The target has to make sure we support lowpart/highpart
5129 extraction, either via direct vector extract or through 5150 extraction, either via direct vector extract or through
5130 an integer mode punning. */ 5151 an integer mode punning. */
5131 tree dst1, dst2; 5152 tree dst1, dst2;
5146 dst2 = make_ssa_name (vectype1); 5167 dst2 = make_ssa_name (vectype1);
5147 epilog_stmt 5168 epilog_stmt
5148 = gimple_build_assign (dst2, BIT_FIELD_REF, 5169 = gimple_build_assign (dst2, BIT_FIELD_REF,
5149 build3 (BIT_FIELD_REF, vectype1, 5170 build3 (BIT_FIELD_REF, vectype1,
5150 new_temp, TYPE_SIZE (vectype1), 5171 new_temp, TYPE_SIZE (vectype1),
5151 bitsize_int (sz * BITS_PER_UNIT))); 5172 bitsize_int (bitsize)));
5152 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5173 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5153 } 5174 }
5154 else 5175 else
5155 { 5176 {
5156 /* Extract via punning to appropriately sized integer mode 5177 /* Extract via punning to appropriately sized integer mode
5157 vector. */ 5178 vector. */
5158 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 5179 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5159 1);
5160 tree etype = build_vector_type (eltype, 2); 5180 tree etype = build_vector_type (eltype, 2);
5161 gcc_assert (convert_optab_handler (vec_extract_optab, 5181 gcc_assert (convert_optab_handler (vec_extract_optab,
5162 TYPE_MODE (etype), 5182 TYPE_MODE (etype),
5163 TYPE_MODE (eltype)) 5183 TYPE_MODE (eltype))
5164 != CODE_FOR_nothing); 5184 != CODE_FOR_nothing);
5183 tem = make_ssa_name (eltype); 5203 tem = make_ssa_name (eltype);
5184 epilog_stmt 5204 epilog_stmt
5185 = gimple_build_assign (tem, BIT_FIELD_REF, 5205 = gimple_build_assign (tem, BIT_FIELD_REF,
5186 build3 (BIT_FIELD_REF, eltype, 5206 build3 (BIT_FIELD_REF, eltype,
5187 new_temp, TYPE_SIZE (eltype), 5207 new_temp, TYPE_SIZE (eltype),
5188 bitsize_int (sz * BITS_PER_UNIT))); 5208 bitsize_int (bitsize)));
5189 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5209 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5190 dst2 = make_ssa_name (vectype1); 5210 dst2 = make_ssa_name (vectype1);
5191 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, 5211 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5192 build1 (VIEW_CONVERT_EXPR, 5212 build1 (VIEW_CONVERT_EXPR,
5193 vectype1, tem)); 5213 vectype1, tem));
5195 } 5215 }
5196 5216
5197 new_temp = make_ssa_name (vectype1); 5217 new_temp = make_ssa_name (vectype1);
5198 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); 5218 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5199 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5219 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5220 new_phis[0] = epilog_stmt;
5200 } 5221 }
5201 5222
5202 if (reduce_with_shift && !slp_reduc) 5223 if (reduce_with_shift && !slp_reduc)
5203 { 5224 {
5204 int element_bitsize = tree_to_uhwi (bitsize); 5225 int element_bitsize = tree_to_uhwi (bitsize);
5224 5245
5225 if (dump_enabled_p ()) 5246 if (dump_enabled_p ())
5226 dump_printf_loc (MSG_NOTE, vect_location, 5247 dump_printf_loc (MSG_NOTE, vect_location,
5227 "Reduce using vector shifts\n"); 5248 "Reduce using vector shifts\n");
5228 5249
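The shift-based epilogue takes log2(nlanes) steps; in scalar terms it behaves like this (editor's sketch for PLUS_EXPR with nlanes a power of two; the final result ends up in lane 0):

    int
    reduce_with_shifts (int nlanes, int *v)
    {
      for (int off = nlanes / 2; off >= 1; off /= 2)
        for (int lane = 0; lane < off; lane++)
          v[lane] += v[lane + off];   /* combine live half with shifted half */
      return v[0];
    }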
5229 mode1 = TYPE_MODE (vectype1); 5250 gimple_seq stmts = NULL;
5230 vec_dest = vect_create_destination_var (scalar_dest, vectype1); 5251 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5231 for (elt_offset = nelements / 2; 5252 for (elt_offset = nelements / 2;
5232 elt_offset >= 1; 5253 elt_offset >= 1;
5233 elt_offset /= 2) 5254 elt_offset /= 2)
5234 { 5255 {
5235 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); 5256 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5236 indices.new_vector (sel, 2, nelements); 5257 indices.new_vector (sel, 2, nelements);
5237 tree mask = vect_gen_perm_mask_any (vectype1, indices); 5258 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5238 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR, 5259 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5239 new_temp, zero_vec, mask); 5260 new_temp, zero_vec, mask);
5240 new_name = make_ssa_name (vec_dest, epilog_stmt); 5261 new_temp = gimple_build (&stmts, code,
5241 gimple_assign_set_lhs (epilog_stmt, new_name); 5262 vectype1, new_name, new_temp);
5242 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5243
5244 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5245 new_temp);
5246 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5247 gimple_assign_set_lhs (epilog_stmt, new_temp);
5248 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5249 } 5263 }
5264 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5250 5265
5251 /* 2.4 Extract the final scalar result. Create: 5266 /* 2.4 Extract the final scalar result. Create:
5252 s_out3 = extract_field <v_out2, bitpos> */ 5267 s_out3 = extract_field <v_out2, bitpos> */
5253 5268
5254 if (dump_enabled_p ()) 5269 if (dump_enabled_p ())
5279 dump_printf_loc (MSG_NOTE, vect_location, 5294 dump_printf_loc (MSG_NOTE, vect_location,
5280 "Reduce using scalar code.\n"); 5295 "Reduce using scalar code.\n");
5281 5296
5282 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); 5297 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5283 int element_bitsize = tree_to_uhwi (bitsize); 5298 int element_bitsize = tree_to_uhwi (bitsize);
5299 tree compute_type = TREE_TYPE (vectype);
5300 gimple_seq stmts = NULL;
5284 FOR_EACH_VEC_ELT (new_phis, i, new_phi) 5301 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5285 { 5302 {
5286 int bit_offset; 5303 int bit_offset;
5287 if (gimple_code (new_phi) == GIMPLE_PHI) 5304 if (gimple_code (new_phi) == GIMPLE_PHI)
5288 vec_temp = PHI_RESULT (new_phi); 5305 vec_temp = PHI_RESULT (new_phi);
5289 else 5306 else
5290 vec_temp = gimple_assign_lhs (new_phi); 5307 vec_temp = gimple_assign_lhs (new_phi);
5291 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, 5308 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5292 bitsize_zero_node); 5309 vec_temp, bitsize, bitsize_zero_node);
5293 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5294 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5295 gimple_assign_set_lhs (epilog_stmt, new_temp);
5296 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5297 5310
5298 /* In SLP we don't need to apply reduction operation, so we just 5311 /* In SLP we don't need to apply reduction operation, so we just
5299 collect s' values in SCALAR_RESULTS. */ 5312 collect s' values in SCALAR_RESULTS. */
5300 if (slp_reduc) 5313 if (slp_reduc)
5301 scalar_results.safe_push (new_temp); 5314 scalar_results.safe_push (new_temp);
5303 for (bit_offset = element_bitsize; 5316 for (bit_offset = element_bitsize;
5304 bit_offset < vec_size_in_bits; 5317 bit_offset < vec_size_in_bits;
5305 bit_offset += element_bitsize) 5318 bit_offset += element_bitsize)
5306 { 5319 {
5307 tree bitpos = bitsize_int (bit_offset); 5320 tree bitpos = bitsize_int (bit_offset);
5308 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, 5321 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5309 bitsize, bitpos); 5322 compute_type, vec_temp,
5310 5323 bitsize, bitpos);
5311 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5312 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5313 gimple_assign_set_lhs (epilog_stmt, new_name);
5314 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5315
5316 if (slp_reduc) 5324 if (slp_reduc)
5317 { 5325 {
5318 /* In SLP we don't need to apply reduction operation, so 5326 /* In SLP we don't need to apply reduction operation, so
5319 we just collect s' values in SCALAR_RESULTS. */ 5327 we just collect s' values in SCALAR_RESULTS. */
5320 new_temp = new_name; 5328 new_temp = new_name;
5321 scalar_results.safe_push (new_name); 5329 scalar_results.safe_push (new_name);
5322 } 5330 }
5323 else 5331 else
5324 { 5332 new_temp = gimple_build (&stmts, code, compute_type,
5325 epilog_stmt = gimple_build_assign (new_scalar_dest, code, 5333 new_name, new_temp);
5326 new_name, new_temp);
5327 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5328 gimple_assign_set_lhs (epilog_stmt, new_temp);
5329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5330 }
5331 } 5334 }
5332 } 5335 }
5333 5336
5334 /* The only case where we need to reduce scalar results in SLP, is 5337 /* The only case where we need to reduce scalar results in SLP, is
5335 unrolling. If the size of SCALAR_RESULTS is greater than 5338 unrolling. If the size of SCALAR_RESULTS is greater than
5336 REDUC_GROUP_SIZE, we reduce them combining elements modulo 5339 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5337 REDUC_GROUP_SIZE. */ 5340 REDUC_GROUP_SIZE. */
5338 if (slp_reduc) 5341 if (slp_reduc)
5339 { 5342 {
5340 tree res, first_res, new_res; 5343 tree res, first_res, new_res;
5341 gimple *new_stmt;
5342 5344
5343 /* Reduce multiple scalar results in case of SLP unrolling. */ 5345 /* Reduce multiple scalar results in case of SLP unrolling. */
5344 for (j = group_size; scalar_results.iterate (j, &res); 5346 for (j = group_size; scalar_results.iterate (j, &res);
5345 j++) 5347 j++)
5346 { 5348 {
5347 first_res = scalar_results[j % group_size]; 5349 first_res = scalar_results[j % group_size];
5348 new_stmt = gimple_build_assign (new_scalar_dest, code, 5350 new_res = gimple_build (&stmts, code, compute_type,
5349 first_res, res); 5351 first_res, res);
5350 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5351 gimple_assign_set_lhs (new_stmt, new_res);
5352 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5353 scalar_results[j % group_size] = new_res; 5352 scalar_results[j % group_size] = new_res;
5354 } 5353 }
5354 for (k = 0; k < group_size; k++)
5355 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5356 scalar_results[k]);
5355 } 5357 }
5356 else 5358 else
5357 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ 5359 {
5358 scalar_results.safe_push (new_temp); 5360 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5361 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5362 scalar_results.safe_push (new_temp);
5363 }
5364
5365 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5359 } 5366 }
5360 5367
5361 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5368 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5362 == INTEGER_INDUC_COND_REDUCTION) 5369 && induc_val)
5363 && !operand_equal_p (initial_def, induc_val, 0))
5364 { 5370 {
5365 /* Earlier we set the initial value to be a vector of induc_val 5371
5366 values. Check the result and if it is induc_val then replace 5372 values. Check the result and if it is induc_val then replace
5367 with the original initial value, unless induc_val is 5373 with the original initial value, unless induc_val is
5368 the same as initial_def already. */ 5374 the same as initial_def already. */
5374 initial_def, new_temp); 5380 initial_def, new_temp);
5375 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5381 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5376 scalar_results[0] = tmp; 5382 scalar_results[0] = tmp;
5377 } 5383 }
5378 } 5384 }
5379 5385
5380 vect_finalize_reduction:
5381
5382 if (double_reduc)
5383 loop = loop->inner;
5384
5385 /* 2.5 Adjust the final result by the initial value of the reduction 5386 /* 2.5 Adjust the final result by the initial value of the reduction
5386 variable. (When such adjustment is not needed, then 5387 variable. (When such adjustment is not needed, then
5387 'adjustment_def' is zero). For example, if code is PLUS we create: 5388 'adjustment_def' is zero). For example, if code is PLUS we create:
5388 new_temp = loop_exit_def + adjustment_def */ 5389 new_temp = loop_exit_def + adjustment_def */
5389 5390
5390 if (adjustment_def) 5391 if (adjustment_def)
5391 { 5392 {
5392 gcc_assert (!slp_reduc); 5393 gcc_assert (!slp_reduc);
5394 gimple_seq stmts = NULL;
5393 if (nested_in_vect_loop) 5395 if (nested_in_vect_loop)
5394 { 5396 {
5395 new_phi = new_phis[0]; 5397 new_phi = new_phis[0];
5396 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); 5398 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5397 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); 5399 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5398 new_dest = vect_create_destination_var (scalar_dest, vectype); 5400 new_temp = gimple_build (&stmts, code, vectype,
5401 PHI_RESULT (new_phi), adjustment_def);
5399 } 5402 }
5400 else 5403 else
5401 { 5404 {
5402 new_temp = scalar_results[0]; 5405 new_temp = scalar_results[0];
5403 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); 5406 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5404 expr = build2 (code, scalar_type, new_temp, adjustment_def); 5407 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5405 new_dest = vect_create_destination_var (scalar_dest, scalar_type); 5408 new_temp = gimple_build (&stmts, code, scalar_type,
5406 } 5409 new_temp, adjustment_def);
5407 5410 }
5408 epilog_stmt = gimple_build_assign (new_dest, expr); 5411
5409 new_temp = make_ssa_name (new_dest, epilog_stmt); 5412 epilog_stmt = gimple_seq_last_stmt (stmts);
5410 gimple_assign_set_lhs (epilog_stmt, new_temp); 5413 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5411 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5412 if (nested_in_vect_loop) 5414 if (nested_in_vect_loop)
5413 { 5415 {
5414 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt); 5416 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5415 STMT_VINFO_RELATED_STMT (epilog_stmt_info) 5417 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5416 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi)); 5418 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5423 else 5425 else
5424 scalar_results[0] = new_temp; 5426 scalar_results[0] = new_temp;
5425 5427
5426 new_phis[0] = epilog_stmt; 5428 new_phis[0] = epilog_stmt;
5427 } 5429 }
5430
5431 if (double_reduc)
5432 loop = loop->inner;
5428 5433
5429 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit 5434 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5430 phis with new adjusted scalar results, i.e., replace use <s_out0> 5435 phis with new adjusted scalar results, i.e., replace use <s_out0>
5431 with use <s_out4>. 5436 with use <s_out4>.
5432 5437
5469 Therefore, we need to match SCALAR_RESULTS with corresponding statements. 5474 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5470 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results 5475 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5471 correspond to the first vector stmt, etc. 5476 correspond to the first vector stmt, etc.
5472 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */ 5477 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5473 if (group_size > new_phis.length ()) 5478 if (group_size > new_phis.length ())
5474 { 5479 gcc_assert (!(group_size % new_phis.length ()));
5475 ratio = group_size / new_phis.length (); 5480
5476 gcc_assert (!(group_size % new_phis.length ()));
5477 }
5478 else
5479 ratio = 1;
5480
5481 stmt_vec_info epilog_stmt_info = NULL;
5482 for (k = 0; k < group_size; k++) 5481 for (k = 0; k < group_size; k++)
5483 { 5482 {
5484 if (k % ratio == 0)
5485 {
5486 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5487 reduction_phi_info = reduction_phis[k / ratio];
5488 if (double_reduc)
5489 inner_phi = inner_phis[k / ratio];
5490 }
5491
5492 if (slp_reduc) 5483 if (slp_reduc)
5493 { 5484 {
5494 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k]; 5485 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5495 5486
5496 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info); 5487 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5497 /* SLP statements can't participate in patterns. */ 5488 /* SLP statements can't participate in patterns. */
5498 gcc_assert (!orig_stmt_info); 5489 gcc_assert (!orig_stmt_info);
5499 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); 5490 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5500 } 5491 }
5501 5492
5502 phis.create (3);
5503 /* Find the loop-closed-use at the loop exit of the original scalar
5504 result. (The reduction result is expected to have two immediate uses -
5505 one at the latch block, and one at the loop exit). */
5506 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5507 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5508 && !is_gimple_debug (USE_STMT (use_p)))
5509 phis.safe_push (USE_STMT (use_p));
5510
5511 /* While we expect to have found an exit_phi because of loop-closed-ssa
5512 form we can end up without one if the scalar cycle is dead. */
5513
5514 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5515 {
5516 if (outer_loop)
5517 {
5518 stmt_vec_info exit_phi_vinfo
5519 = loop_vinfo->lookup_stmt (exit_phi);
5520 gphi *vect_phi;
5521
5522 /* FORNOW. Currently not supporting the case that an inner-loop
5523 reduction is not used in the outer-loop (but only outside the
5524 outer-loop), unless it is double reduction. */
5525 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5526 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5527 || double_reduc);
5528
5529 if (double_reduc)
5530 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5531 else
5532 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5533 if (!double_reduc
5534 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5535 != vect_double_reduction_def)
5536 continue;
5537
5538 /* Handle double reduction:
5539
5540 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5541 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5542 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5543 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5544
5545 At that point the regular reduction (stmt2 and stmt3) is
5546 already vectorized, as well as the exit phi node, stmt4.
5547 Here we vectorize the phi node of double reduction, stmt1, and
5548 update all relevant statements. */
5549
5550 /* Go through all the uses of s2 to find double reduction phi
5551 node, i.e., stmt1 above. */
5552 orig_name = PHI_RESULT (exit_phi);
5553 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5554 {
5555 stmt_vec_info use_stmt_vinfo;
5556 tree vect_phi_init, preheader_arg, vect_phi_res;
5557 basic_block bb = gimple_bb (use_stmt);
5558
5559 /* Check that USE_STMT is really double reduction phi
5560 node. */
5561 if (gimple_code (use_stmt) != GIMPLE_PHI
5562 || gimple_phi_num_args (use_stmt) != 2
5563 || bb->loop_father != outer_loop)
5564 continue;
5565 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5566 if (!use_stmt_vinfo
5567 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5568 != vect_double_reduction_def)
5569 continue;
5570
5571 /* Create vector phi node for double reduction:
5572 vs1 = phi <vs0, vs2>
5573 vs1 was created previously in this function by a call to
5574 vect_get_vec_def_for_operand and is stored in
5575 vec_initial_def;
5576 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5577 vs0 is created here. */
5578
5579 /* Create vector phi node. */
5580 vect_phi = create_phi_node (vec_initial_def, bb);
5581 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5582
5583 /* Create vs0 - initial def of the double reduction phi. */
5584 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5585 loop_preheader_edge (outer_loop));
5586 vect_phi_init = get_initial_def_for_reduction
5587 (stmt_info, preheader_arg, NULL);
5588
5589 /* Update phi node arguments with vs0 and vs2. */
5590 add_phi_arg (vect_phi, vect_phi_init,
5591 loop_preheader_edge (outer_loop),
5592 UNKNOWN_LOCATION);
5593 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5594 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5595 if (dump_enabled_p ())
5596 dump_printf_loc (MSG_NOTE, vect_location,
5597 "created double reduction phi node: %G",
5598 vect_phi);
5599
5600 vect_phi_res = PHI_RESULT (vect_phi);
5601
5602 /* Replace the use, i.e., set the correct vs1 in the regular
5603 reduction phi node. FORNOW, NCOPIES is always 1, so the
5604 loop is redundant. */
5605 stmt_vec_info use_info = reduction_phi_info;
5606 for (j = 0; j < ncopies; j++)
5607 {
5608 edge pr_edge = loop_preheader_edge (loop);
5609 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5610 pr_edge->dest_idx, vect_phi_res);
5611 use_info = STMT_VINFO_RELATED_STMT (use_info);
5612 }
5613 }
5614 }
5615 }
5616
5617 phis.release ();
5618 if (nested_in_vect_loop) 5493 if (nested_in_vect_loop)
5619 { 5494 {
5620 if (double_reduc) 5495 if (double_reduc)
5621 loop = outer_loop; 5496 loop = outer_loop;
5622 else 5497 else
5623 continue; 5498 gcc_unreachable ();
5624 } 5499 }
5625 5500
5626 phis.create (3); 5501 phis.create (3);
5627 /* Find the loop-closed-use at the loop exit of the original scalar 5502 /* Find the loop-closed-use at the loop exit of the original scalar
5628 result. (The reduction result is expected to have two immediate uses, 5503 result. (The reduction result is expected to have two immediate uses,
5656 { 5531 {
5657 /* Replace the uses: */ 5532 /* Replace the uses: */
5658 orig_name = PHI_RESULT (exit_phi); 5533 orig_name = PHI_RESULT (exit_phi);
5659 scalar_result = scalar_results[k]; 5534 scalar_result = scalar_results[k];
5660 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 5535 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5661 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 5536 {
5662 SET_USE (use_p, scalar_result); 5537 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5538 SET_USE (use_p, scalar_result);
5539 update_stmt (use_stmt);
5540 }
5663 } 5541 }
5664 5542
5665 phis.release (); 5543 phis.release ();
5666 } 5544 }
5667 } 5545 }
5716 lhs = new_name; 5594 lhs = new_name;
5717 } 5595 }
5718 return lhs; 5596 return lhs;
5719 } 5597 }
5720 5598
5599 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5600 type of the vector input. */
5601
5602 static internal_fn
5603 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5604 {
5605 internal_fn mask_reduc_fn;
5606
5607 switch (reduc_fn)
5608 {
5609 case IFN_FOLD_LEFT_PLUS:
5610 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5611 break;
5612
5613 default:
5614 return IFN_LAST;
5615 }
5616
5617 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5618 OPTIMIZE_FOR_SPEED))
5619 return mask_reduc_fn;
5620 return IFN_LAST;
5621 }
5622
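For reference, a masked in-order (fold-left) reduction keeps the accumulation order of the active lanes and simply skips inactive ones, which is what makes it usable for strict FP addition (editor's sketch of the semantics, not the internal function's implementation):

    double
    masked_fold_left_plus (int nlanes, const double *v,
                           const unsigned char *mask, double acc)
    {
      for (int lane = 0; lane < nlanes; lane++)
        if (mask[lane])
          acc += v[lane];    /* one lane at a time, in lane order */
      return acc;
    }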
5721 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the 5623 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5722 statement that sets the live-out value. REDUC_DEF_STMT is the phi 5624 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5723 statement. CODE is the operation performed by STMT_INFO and OPS are 5625 statement. CODE is the operation performed by STMT_INFO and OPS are
5724 its scalar operands. REDUC_INDEX is the index of the operand in 5626 its scalar operands. REDUC_INDEX is the index of the operand in
5725 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that 5627 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5735 tree_code code, internal_fn reduc_fn, 5637 tree_code code, internal_fn reduc_fn,
5736 tree ops[3], tree vectype_in, 5638 tree ops[3], tree vectype_in,
5737 int reduc_index, vec_loop_masks *masks) 5639 int reduc_index, vec_loop_masks *masks)
5738 { 5640 {
5739 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5641 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5740 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 5642 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5741 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 5643 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5742 stmt_vec_info new_stmt_info = NULL; 5644 stmt_vec_info new_stmt_info = NULL;
5645 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5743 5646
5744 int ncopies; 5647 int ncopies;
5745 if (slp_node) 5648 if (slp_node)
5746 ncopies = 1; 5649 ncopies = 1;
5747 else 5650 else
5748 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 5651 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5749 5652
5750 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info)); 5653 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5751 gcc_assert (ncopies == 1); 5654 gcc_assert (ncopies == 1);
5752 gcc_assert (TREE_CODE_LENGTH (code) == binary_op); 5655 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5753 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5754 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5755 == FOLD_LEFT_REDUCTION);
5756 5656
5757 if (slp_node) 5657 if (slp_node)
5758 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), 5658 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5759 TYPE_VECTOR_SUBPARTS (vectype_in))); 5659 TYPE_VECTOR_SUBPARTS (vectype_in)));
5760 5660
5763 int group_size = 1; 5663 int group_size = 1;
5764 stmt_vec_info scalar_dest_def_info; 5664 stmt_vec_info scalar_dest_def_info;
5765 auto_vec<tree> vec_oprnds0; 5665 auto_vec<tree> vec_oprnds0;
5766 if (slp_node) 5666 if (slp_node)
5767 { 5667 {
5768 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL, 5668 auto_vec<vec<tree> > vec_defs (2);
5769 slp_node); 5669 vect_get_slp_defs (slp_node, &vec_defs);
5670 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5671 vec_defs[0].release ();
5672 vec_defs[1].release ();
5770 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 5673 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5771 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; 5674 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5772 } 5675 }
5773 else 5676 else
5774 { 5677 {
5808 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0); 5711 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5809 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5712 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5810 def0 = negated; 5713 def0 = negated;
5811 } 5714 }
5812 5715
5813 if (mask) 5716 if (mask && mask_reduc_fn == IFN_LAST)
5814 def0 = merge_with_identity (gsi, mask, vectype_out, def0, 5717 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5815 vector_identity); 5718 vector_identity);
5816 5719
5817 /* On the first iteration the input is simply the scalar phi 5720 /* On the first iteration the input is simply the scalar phi
5818 result, and for subsequent iterations it is the output of 5721 result, and for subsequent iterations it is the output of
5819 the preceding operation. */ 5722 the preceding operation. */
5820 if (reduc_fn != IFN_LAST) 5723 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5821 { 5724 {
5822 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0); 5725 if (mask && mask_reduc_fn != IFN_LAST)
5726 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5727 def0, mask);
5728 else
5729 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5730 def0);
5823 /* For chained SLP reductions the output of the previous reduction 5731 /* For chained SLP reductions the output of the previous reduction
5824 operation serves as the input of the next. For the final statement 5732 operation serves as the input of the next. For the final statement
5825 the output cannot be a temporary - we reuse the original 5733 the output cannot be a temporary - we reuse the original
5826 scalar destination of the last statement. */ 5734 scalar destination of the last statement. */
5827 if (i != vec_num - 1) 5735 if (i != vec_num - 1)
5837 reduc_var, def0); 5745 reduc_var, def0);
5838 new_stmt = SSA_NAME_DEF_STMT (reduc_var); 5746 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5839 /* Remove the statement, so that we can use the same code paths 5747 /* Remove the statement, so that we can use the same code paths
5840 as for statements that we've just created. */ 5748 as for statements that we've just created. */
5841 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt); 5749 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5842 gsi_remove (&tmp_gsi, false); 5750 gsi_remove (&tmp_gsi, true);
5843 } 5751 }
5844 5752
5845 if (i == vec_num - 1) 5753 if (i == vec_num - 1)
5846 { 5754 {
5847 gimple_set_lhs (new_stmt, scalar_dest); 5755 gimple_set_lhs (new_stmt, scalar_dest);
5866 5774
5867 Check if STMT_VINFO (which is part of loop LOOP) both increments and 5775
5868 does not cause overflow. */ 5776 does not cause overflow. */
5869 5777
5870 static bool 5778 static bool
5871 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop) 5779 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5872 { 5780 {
5873 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt); 5781 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5874 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo); 5782 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5875 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo); 5783 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5876 tree lhs_type = TREE_TYPE (gimple_phi_result (phi)); 5784 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5900 if (overflow) 5808 if (overflow)
5901 return false; 5809 return false;
5902 5810
5903 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type)) 5811 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5904 <= TYPE_PRECISION (lhs_type)); 5812 <= TYPE_PRECISION (lhs_type));
5813 }
5814
5815 /* Check if masking can be supported by inserting a conditional expression.
5816 CODE is the code for the operation. COND_FN is the conditional internal
5817 function, if it exists. VECTYPE_IN is the type of the vector input. */
5818 static bool
5819 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5820 tree vectype_in)
5821 {
5822 if (cond_fn != IFN_LAST
5823 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5824 OPTIMIZE_FOR_SPEED))
5825 return false;
5826
5827 switch (code)
5828 {
5829 case DOT_PROD_EXPR:
5830 case SAD_EXPR:
5831 return true;
5832
5833 default:
5834 return false;
5835 }
5836 }
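/* As an illustrative sketch (the function and variable names below are
   invented, not part of this file), the kind of scalar loop this applies
   to is a sum-of-absolute-differences reduction:

     int
     sad (unsigned char *a, unsigned char *b, int n)
     {
       int sum = 0;
       for (int i = 0; i < n; i++)
	 sum += __builtin_abs (a[i] - b[i]);
       return sum;
     }

   When the loop is fully masked and no conditional internal function for
   the operation exists, inactive lanes can instead be neutralized by a
   VEC_COND_EXPR on one input (see build_vect_cond_expr below), so that
   they add nothing to the accumulator.  */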
5837
5838 /* Insert a conditional expression to enable masked vectorization. CODE is the
5839 code for the operation. VOP is the array of operands. MASK is the loop
5840 mask. GSI is a statement iterator used to place the new conditional
5841 expression. */
5842 static void
5843 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5844 gimple_stmt_iterator *gsi)
5845 {
5846 switch (code)
5847 {
5848 case DOT_PROD_EXPR:
5849 {
5850 tree vectype = TREE_TYPE (vop[1]);
5851 tree zero = build_zero_cst (vectype);
5852 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5853 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5854 mask, vop[1], zero);
5855 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5856 vop[1] = masked_op1;
5857 break;
5858 }
5859
5860 case SAD_EXPR:
5861 {
5862 tree vectype = TREE_TYPE (vop[1]);
5863 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5864 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5865 mask, vop[1], vop[0]);
5866 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5867 vop[1] = masked_op1;
5868 break;
5869 }
5870
5871 default:
5872 gcc_unreachable ();
5873 }
5905 } 5874 }
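/* For example (a hypothetical GIMPLE sketch, the SSA names are invented),
   masking a DOT_PROD_EXPR input with the loop mask turns

     sum_3 = DOT_PROD_EXPR <va_1, vb_2, sum_0>;

   into

     masked_op1_4 = VEC_COND_EXPR <loop_mask_5, vb_2, { 0, ... }>;
     sum_3 = DOT_PROD_EXPR <va_1, masked_op1_4, sum_0>;

   so masked-off lanes multiply by zero and contribute nothing; for
   SAD_EXPR the selected value is vop[0] instead, making the lane's
   absolute difference zero.  */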
5906 5875
5907 /* Function vectorizable_reduction. 5876 /* Function vectorizable_reduction.
5908 5877
5909 Check if STMT_INFO performs a reduction operation that can be vectorized. 5878 Check if STMT_INFO performs a reduction operation that can be vectorized.
5945 indicates what is the actual level of parallelism (V8HI in the example), so 5914 indicates what is the actual level of parallelism (V8HI in the example), so
5946 that the right vectorization factor would be derived. This vectype 5915 that the right vectorization factor would be derived. This vectype
5947 corresponds to the type of arguments to the reduction stmt, and should *NOT* 5916 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5948 be used to create the vectorized stmt. The right vectype for the vectorized 5917 be used to create the vectorized stmt. The right vectype for the vectorized
5949 stmt is obtained from the type of the result X: 5918 stmt is obtained from the type of the result X:
5950 get_vectype_for_scalar_type (TREE_TYPE (X)) 5919 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5951 5920
5952 This means that, contrary to "regular" reductions (or "regular" stmts in 5921 This means that, contrary to "regular" reductions (or "regular" stmts in
5953 general), the following equation: 5922 general), the following equation:
5954 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) 5923 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5955 does *NOT* necessarily hold for reduction patterns. */ 5924 does *NOT* necessarily hold for reduction patterns. */
5956 5925
5957 bool 5926 bool
5958 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, 5927 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
5959 stmt_vec_info *vec_stmt, slp_tree slp_node,
5960 slp_instance slp_node_instance, 5928 slp_instance slp_node_instance,
5961 stmt_vector_for_cost *cost_vec) 5929 stmt_vector_for_cost *cost_vec)
5962 { 5930 {
5963 tree vec_dest;
5964 tree scalar_dest; 5931 tree scalar_dest;
5965 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5966 tree vectype_in = NULL_TREE; 5932 tree vectype_in = NULL_TREE;
5967 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5933 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5968 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 5934 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5969 enum tree_code code, orig_code; 5935 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5970 internal_fn reduc_fn;
5971 machine_mode vec_mode;
5972 int op_type;
5973 optab optab;
5974 tree new_temp = NULL_TREE;
5975 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5976 stmt_vec_info cond_stmt_vinfo = NULL; 5936 stmt_vec_info cond_stmt_vinfo = NULL;
5977 enum tree_code cond_reduc_op_code = ERROR_MARK;
5978 tree scalar_type; 5937 tree scalar_type;
5979 bool is_simple_use;
5980 int i; 5938 int i;
5981 int ncopies; 5939 int ncopies;
5982 int epilog_copies;
5983 stmt_vec_info prev_stmt_info, prev_phi_info;
5984 bool single_defuse_cycle = false; 5940 bool single_defuse_cycle = false;
5985 stmt_vec_info new_stmt_info = NULL; 5941 bool nested_cycle = false;
5986 int j;
5987 tree ops[3];
5988 enum vect_def_type dts[3];
5989 bool nested_cycle = false, found_nested_cycle_def = false;
5990 bool double_reduc = false; 5942 bool double_reduc = false;
5991 basic_block def_bb;
5992 struct loop * def_stmt_loop;
5993 tree def_arg;
5994 auto_vec<tree> vec_oprnds0;
5995 auto_vec<tree> vec_oprnds1;
5996 auto_vec<tree> vec_oprnds2;
5997 auto_vec<tree> vect_defs;
5998 auto_vec<stmt_vec_info> phis;
5999 int vec_num; 5943 int vec_num;
6000 tree def0, tem; 5944 tree tem;
6001 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; 5945 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6002 tree cond_reduc_val = NULL_TREE; 5946 tree cond_reduc_val = NULL_TREE;
6003 5947
6004 /* Make sure it was already recognized as a reduction computation. */ 5948 /* Make sure it was already recognized as a reduction computation. */
6005 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def 5949 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5950 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6006 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle) 5951 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6007 return false; 5952 return false;
6008 5953
5954 /* The stmt we store reduction analysis meta on. */
5955 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
5956 reduc_info->is_reduc_info = true;
5957
5958 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5959 {
5960 if (is_a <gphi *> (stmt_info->stmt))
5961 /* Analysis for double-reduction is done on the outer
5962 loop PHI, nested cycles have no further restrictions. */
5963 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5964 else
5965 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5966 return true;
5967 }
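  /* A hedged illustration of the distinction (the arrays and bounds here
     are hypothetical).  When the outer loop is the one being vectorized,

       for (i = 0; i < n; i++)
	 {
	   int s = 0;
	   for (j = 0; j < m; j++)
	     s += a[i][j];
	   b[i] = s;
	 }

     the inner summation is a nested cycle: s is reset and consumed in every
     outer iteration.  A double reduction instead keeps accumulating into
     the same variable across both loops (e.g. sum += a[i][j] with sum live
     after the outer loop); only that case needs the outer-loop PHI
     analysis.  */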
5968
5969 stmt_vec_info orig_stmt_of_analysis = stmt_info;
5970 stmt_vec_info phi_info = stmt_info;
5971 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
5972 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5973 {
5974 if (!is_a <gphi *> (stmt_info->stmt))
5975 {
5976 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5977 return true;
5978 }
5979 if (slp_node)
5980 {
5981 slp_node_instance->reduc_phis = slp_node;
5982 /* ??? We're leaving slp_node to point to the PHIs, we only
5983 need it to get at the number of vector stmts which wasn't
5984 yet initialized for the instance root. */
5985 }
5986 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5987 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5988 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
5989 {
5990 use_operand_p use_p;
5991 gimple *use_stmt;
5992 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
5993 &use_p, &use_stmt);
5994 gcc_assert (res);
5995 phi_info = loop_vinfo->lookup_stmt (use_stmt);
5996 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
5997 }
5998 }
5999
6000 /* PHIs should not participate in patterns. */
6001 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6002 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6003
6004 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6005 and compute the reduction chain length. */
6006 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6007 loop_latch_edge (loop));
6008 unsigned reduc_chain_length = 0;
6009 bool only_slp_reduc_chain = true;
6010 stmt_info = NULL;
6011 while (reduc_def != PHI_RESULT (reduc_def_phi))
6012 {
6013 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6014 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6015 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6016 {
6017 if (dump_enabled_p ())
6018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6019 "reduction chain broken by patterns.\n");
6020 return false;
6021 }
6022 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6023 only_slp_reduc_chain = false;
6024 /* ??? For epilogue generation live members of the chain need
6025 to point back to the PHI via their original stmt for
6026 info_for_reduction to work. */
6027 if (STMT_VINFO_LIVE_P (vdef))
6028 STMT_VINFO_REDUC_DEF (def) = phi_info;
6029 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6030 if (!assign)
6031 {
6032 if (dump_enabled_p ())
6033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6034 "reduction chain includes calls.\n");
6035 return false;
6036 }
6037 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6038 {
6039 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6040 TREE_TYPE (gimple_assign_rhs1 (assign))))
6041 {
6042 if (dump_enabled_p ())
6043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6044 "conversion in the reduction chain.\n");
6045 return false;
6046 }
6047 }
6048 else if (!stmt_info)
6049 /* First non-conversion stmt. */
6050 stmt_info = vdef;
6051 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6052 reduc_chain_length++;
6053 }
6054 /* PHIs should not participate in patterns. */
6055 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
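  /* As a sketch of the walk above (the SSA names are invented for
     illustration), for a chain such as

       sum_1 = PHI <sum_0 (preheader), sum_4 (latch)>
       sum_3 = _12 + sum_1;
       sum_4 = _15 + sum_3;

     following STMT_VINFO_REDUC_IDX from the latch value sum_4 visits sum_3
     and stops at the PHI result sum_1, giving reduc_chain_length == 2.  */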
6056
6009 if (nested_in_vect_loop_p (loop, stmt_info)) 6057 if (nested_in_vect_loop_p (loop, stmt_info))
6010 { 6058 {
6011 loop = loop->inner; 6059 loop = loop->inner;
6012 nested_cycle = true; 6060 nested_cycle = true;
6013 } 6061 }
6014 6062
6063 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6064 element. */
6065 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6066 {
6067 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6068 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6069 }
6015 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 6070 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6016 gcc_assert (slp_node 6071 gcc_assert (slp_node
6017 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); 6072 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6018
6019 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6020 {
6021 tree phi_result = gimple_phi_result (phi);
6022 /* Analysis is fully done on the reduction stmt invocation. */
6023 if (! vec_stmt)
6024 {
6025 if (slp_node)
6026 slp_node_instance->reduc_phis = slp_node;
6027
6028 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6029 return true;
6030 }
6031
6032 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6033 /* Leave the scalar phi in place. Note that checking
6034 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6035 for reductions involving a single statement. */
6036 return true;
6037
6038 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6039 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6040
6041 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6042 == EXTRACT_LAST_REDUCTION)
6043 /* Leave the scalar phi in place. */
6044 return true;
6045
6046 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6047 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6048 {
6049 tree op = gimple_op (reduc_stmt, k);
6050 if (op == phi_result)
6051 continue;
6052 if (k == 1
6053 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6054 continue;
6055 if (!vectype_in
6056 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6057 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6058 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6059 break;
6060 }
6061 gcc_assert (vectype_in);
6062
6063 if (slp_node)
6064 ncopies = 1;
6065 else
6066 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6067
6068 stmt_vec_info use_stmt_info;
6069 if (ncopies > 1
6070 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6071 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6072 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6073 single_defuse_cycle = true;
6074
6075 /* Create the destination vector */
6076 scalar_dest = gimple_assign_lhs (reduc_stmt);
6077 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6078
6079 if (slp_node)
6080 /* The size vect_schedule_slp_instance computes is off for us. */
6081 vec_num = vect_get_num_vectors
6082 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6083 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6084 vectype_in);
6085 else
6086 vec_num = 1;
6087
6088 /* Generate the reduction PHIs upfront. */
6089 prev_phi_info = NULL;
6090 for (j = 0; j < ncopies; j++)
6091 {
6092 if (j == 0 || !single_defuse_cycle)
6093 {
6094 for (i = 0; i < vec_num; i++)
6095 {
6096 /* Create the reduction-phi that defines the reduction
6097 operand. */
6098 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6099 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6100
6101 if (slp_node)
6102 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6103 else
6104 {
6105 if (j == 0)
6106 STMT_VINFO_VEC_STMT (stmt_info)
6107 = *vec_stmt = new_phi_info;
6108 else
6109 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6110 prev_phi_info = new_phi_info;
6111 }
6112 }
6113 }
6114 }
6115
6116 return true;
6117 }
6118 6073
6119 /* 1. Is vectorizable reduction? */ 6074 /* 1. Is vectorizable reduction? */
6120 /* Not supportable if the reduction variable is used in the loop, unless 6075 /* Not supportable if the reduction variable is used in the loop, unless
6121 it's a reduction chain. */ 6076 it's a reduction chain. */
6122 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer 6077 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6145 6100
6146 /* 3. Check the operands of the operation. The first operands are defined 6101 /* 3. Check the operands of the operation. The first operands are defined
6147 inside the loop body. The last operand is the reduction variable, 6102 inside the loop body. The last operand is the reduction variable,
6148 which is defined by the loop-header-phi. */ 6103 which is defined by the loop-header-phi. */
6149 6104
6105 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6106 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6150 gassign *stmt = as_a <gassign *> (stmt_info->stmt); 6107 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6151 6108 enum tree_code code = gimple_assign_rhs_code (stmt);
6152 /* Flatten RHS. */ 6109 bool lane_reduc_code_p
6153 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 6110 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6154 { 6111 int op_type = TREE_CODE_LENGTH (code);
6155 case GIMPLE_BINARY_RHS:
6156 code = gimple_assign_rhs_code (stmt);
6157 op_type = TREE_CODE_LENGTH (code);
6158 gcc_assert (op_type == binary_op);
6159 ops[0] = gimple_assign_rhs1 (stmt);
6160 ops[1] = gimple_assign_rhs2 (stmt);
6161 break;
6162
6163 case GIMPLE_TERNARY_RHS:
6164 code = gimple_assign_rhs_code (stmt);
6165 op_type = TREE_CODE_LENGTH (code);
6166 gcc_assert (op_type == ternary_op);
6167 ops[0] = gimple_assign_rhs1 (stmt);
6168 ops[1] = gimple_assign_rhs2 (stmt);
6169 ops[2] = gimple_assign_rhs3 (stmt);
6170 break;
6171
6172 case GIMPLE_UNARY_RHS:
6173 return false;
6174
6175 default:
6176 gcc_unreachable ();
6177 }
6178
6179 if (code == COND_EXPR && slp_node)
6180 return false;
6181 6112
6182 scalar_dest = gimple_assign_lhs (stmt); 6113 scalar_dest = gimple_assign_lhs (stmt);
6183 scalar_type = TREE_TYPE (scalar_dest); 6114 scalar_type = TREE_TYPE (scalar_dest);
6184 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) 6115 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6185 && !SCALAR_FLOAT_TYPE_P (scalar_type)) 6116 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6187 6118
6188 /* Do not try to vectorize bit-precision reductions. */ 6119 /* Do not try to vectorize bit-precision reductions. */
6189 if (!type_has_mode_precision_p (scalar_type)) 6120 if (!type_has_mode_precision_p (scalar_type))
6190 return false; 6121 return false;
6191 6122
6123 /* For lane-reducing ops we're reducing the number of reduction PHIs
6124 which means the only use of that may be in the lane-reducing operation. */
6125 if (lane_reduc_code_p
6126 && reduc_chain_length != 1
6127 && !only_slp_reduc_chain)
6128 {
6129 if (dump_enabled_p ())
6130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6131 "lane-reducing reduction with extra stmts.\n");
6132 return false;
6133 }
6134
6192 /* All uses but the last are expected to be defined in the loop. 6135 /* All uses but the last are expected to be defined in the loop.
6193 The last use is the reduction variable. In case of nested cycle this 6136 The last use is the reduction variable. In case of nested cycle this
6194 assumption is not true: we use reduc_index to record the index of the 6137 assumption is not true: we use reduc_index to record the index of the
6195 reduction variable. */ 6138 reduction variable. */
6196 stmt_vec_info reduc_def_info = NULL; 6139 reduc_def = PHI_RESULT (reduc_def_phi);
6197 int reduc_index = -1;
6198 for (i = 0; i < op_type; i++) 6140 for (i = 0; i < op_type; i++)
6199 { 6141 {
6142 tree op = gimple_op (stmt, i + 1);
6200 /* The condition of COND_EXPR is checked in vectorizable_condition(). */ 6143 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6201 if (i == 0 && code == COND_EXPR) 6144 if (i == 0 && code == COND_EXPR)
6202 continue; 6145 continue;
6203 6146
6204 stmt_vec_info def_stmt_info; 6147 stmt_vec_info def_stmt_info;
6205 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem, 6148 enum vect_def_type dt;
6206 &def_stmt_info); 6149 if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
6207 dt = dts[i]; 6150 &def_stmt_info))
6208 gcc_assert (is_simple_use); 6151 {
6209 if (dt == vect_reduction_def) 6152 if (dump_enabled_p ())
6210 { 6153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6211 reduc_def_info = def_stmt_info; 6154 "use not simple.\n");
6212 reduc_index = i; 6155 return false;
6213 continue; 6156 }
6214 } 6157 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6215 else if (tem) 6158 continue;
6216 { 6159
6217 /* To properly compute ncopies we are interested in the widest 6160 /* There should be only one cycle def in the stmt, the one
6218 input type in case we're looking at a widening accumulation. */ 6161 leading to reduc_def. */
6219 if (!vectype_in 6162 if (VECTORIZABLE_CYCLE_DEF (dt))
6163 return false;
6164
6165 /* To properly compute ncopies we are interested in the widest
6166 non-reduction input type in case we're looking at a widening
6167 accumulation that we later handle in vect_transform_reduction. */
6168 if (lane_reduc_code_p
6169 && tem
6170 && (!vectype_in
6220 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) 6171 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6221 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))) 6172 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6222 vectype_in = tem; 6173 vectype_in = tem;
6223 } 6174
6224 6175 if (code == COND_EXPR)
6225 if (dt != vect_internal_def 6176 {
6226 && dt != vect_external_def 6177 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6227 && dt != vect_constant_def
6228 && dt != vect_induction_def
6229 && !(dt == vect_nested_cycle && nested_cycle))
6230 return false;
6231
6232 if (dt == vect_nested_cycle)
6233 {
6234 found_nested_cycle_def = true;
6235 reduc_def_info = def_stmt_info;
6236 reduc_index = i;
6237 }
6238
6239 if (i == 1 && code == COND_EXPR)
6240 {
6241 /* Record how value of COND_EXPR is defined. */
6242 if (dt == vect_constant_def) 6178 if (dt == vect_constant_def)
6243 { 6179 {
6244 cond_reduc_dt = dt; 6180 cond_reduc_dt = dt;
6245 cond_reduc_val = ops[i]; 6181 cond_reduc_val = op;
6246 } 6182 }
6247 if (dt == vect_induction_def 6183 if (dt == vect_induction_def
6248 && def_stmt_info 6184 && def_stmt_info
6249 && is_nonwrapping_integer_induction (def_stmt_info, loop)) 6185 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6250 { 6186 {
6251 cond_reduc_dt = dt; 6187 cond_reduc_dt = dt;
6252 cond_stmt_vinfo = def_stmt_info; 6188 cond_stmt_vinfo = def_stmt_info;
6253 } 6189 }
6254 } 6190 }
6255 } 6191 }
6256
6257 if (!vectype_in) 6192 if (!vectype_in)
6258 vectype_in = vectype_out; 6193 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6259 6194 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6260 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not 6195
6261 directy used in stmt. */ 6196 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6262 if (reduc_index == -1) 6197 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6263 { 6198 /* If we have a condition reduction, see if we can simplify it further. */
6264 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) 6199 if (v_reduc_type == COND_REDUCTION)
6200 {
6201 if (slp_node)
6202 return false;
6203
6204 /* When the condition uses the reduction value in the condition, fail. */
6205 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6265 { 6206 {
6266 if (dump_enabled_p ()) 6207 if (dump_enabled_p ())
6267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6268 "in-order reduction chain without SLP.\n"); 6209 "condition depends on previous iteration\n");
6269 return false; 6210 return false;
6270 } 6211 }
6271 6212
6272 if (orig_stmt_info) 6213 if (reduc_chain_length == 1
6273 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info); 6214 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6274 else 6215 vectype_in, OPTIMIZE_FOR_SPEED))
6275 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6276 }
6277
6278 if (! reduc_def_info)
6279 return false;
6280
6281 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6282 if (!reduc_def_phi)
6283 return false;
6284
6285 if (!(reduc_index == -1
6286 || dts[reduc_index] == vect_reduction_def
6287 || dts[reduc_index] == vect_nested_cycle
6288 || ((dts[reduc_index] == vect_internal_def
6289 || dts[reduc_index] == vect_external_def
6290 || dts[reduc_index] == vect_constant_def
6291 || dts[reduc_index] == vect_induction_def)
6292 && nested_cycle && found_nested_cycle_def)))
6293 {
6294 /* For pattern recognized stmts, orig_stmt might be a reduction,
6295 but some helper statements for the pattern might not, or
6296 might be COND_EXPRs with reduction uses in the condition. */
6297 gcc_assert (orig_stmt_info);
6298 return false;
6299 }
6300
6301 /* PHIs should not participate in patterns. */
6302 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6303 enum vect_reduction_type v_reduc_type
6304 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6305 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6306
6307 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6308 /* If we have a condition reduction, see if we can simplify it further. */
6309 if (v_reduc_type == COND_REDUCTION)
6310 {
6311 /* TODO: We can't yet handle reduction chains, since we need to treat
6312 each COND_EXPR in the chain specially, not just the last one.
6313 E.g. for:
6314
6315 x_1 = PHI <x_3, ...>
6316 x_2 = a_2 ? ... : x_1;
6317 x_3 = a_3 ? ... : x_2;
6318
6319 we're interested in the last element in x_3 for which a_2 || a_3
6320 is true, whereas the current reduction chain handling would
6321 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6322 as a reduction operation. */
6323 if (reduc_index == -1)
6324 {
6325 if (dump_enabled_p ())
6326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6327 "conditional reduction chains not supported\n");
6328 return false;
6329 }
6330
6331 /* vect_is_simple_reduction ensured that operand 2 is the
6332 loop-carried operand. */
6333 gcc_assert (reduc_index == 2);
6334
6335 /* Loop peeling modifies initial value of reduction PHI, which
6336 makes the reduction stmt to be transformed different to the
6337 original stmt analyzed. We need to record reduction code for
6338 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6339 it can be used directly at transform stage. */
6340 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6341 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6342 {
6343 /* Also set the reduction type to CONST_COND_REDUCTION. */
6344 gcc_assert (cond_reduc_dt == vect_constant_def);
6345 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6346 }
6347 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6348 vectype_in, OPTIMIZE_FOR_SPEED))
6349 { 6216 {
6350 if (dump_enabled_p ()) 6217 if (dump_enabled_p ())
6351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6352 "optimizing condition reduction with" 6219 "optimizing condition reduction with"
6353 " FOLD_EXTRACT_LAST.\n"); 6220 " FOLD_EXTRACT_LAST.\n");
6354 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION; 6221 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6355 } 6222 }
6356 else if (cond_reduc_dt == vect_induction_def) 6223 else if (cond_reduc_dt == vect_induction_def)
6357 { 6224 {
6358 tree base 6225 tree base
6359 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo); 6226 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6360 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo); 6227 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6361 6228
6362 gcc_assert (TREE_CODE (base) == INTEGER_CST 6229 gcc_assert (TREE_CODE (base) == INTEGER_CST
6363 && TREE_CODE (step) == INTEGER_CST); 6230 && TREE_CODE (step) == INTEGER_CST);
6364 cond_reduc_val = NULL_TREE; 6231 cond_reduc_val = NULL_TREE;
6232 enum tree_code cond_reduc_op_code = ERROR_MARK;
6233 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6234 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6235 ;
6365 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR 6236 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6366 above base; punt if base is the minimum value of the type for 6237 above base; punt if base is the minimum value of the type for
6367 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */ 6238 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6368 if (tree_int_cst_sgn (step) == -1) 6239 else if (tree_int_cst_sgn (step) == -1)
6369 { 6240 {
6370 cond_reduc_op_code = MIN_EXPR; 6241 cond_reduc_op_code = MIN_EXPR;
6371 if (tree_int_cst_sgn (base) == -1) 6242 if (tree_int_cst_sgn (base) == -1)
6372 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 6243 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6373 else if (tree_int_cst_lt (base, 6244 else if (tree_int_cst_lt (base,
6389 { 6260 {
6390 if (dump_enabled_p ()) 6261 if (dump_enabled_p ())
6391 dump_printf_loc (MSG_NOTE, vect_location, 6262 dump_printf_loc (MSG_NOTE, vect_location,
6392 "condition expression based on " 6263 "condition expression based on "
6393 "integer induction.\n"); 6264 "integer induction.\n");
6394 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6265 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6395 = INTEGER_INDUC_COND_REDUCTION; 6266 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6267 = cond_reduc_val;
6268 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6396 } 6269 }
6397 } 6270 }
6398 else if (cond_reduc_dt == vect_constant_def) 6271 else if (cond_reduc_dt == vect_constant_def)
6399 { 6272 {
6400 enum vect_def_type cond_initial_dt; 6273 enum vect_def_type cond_initial_dt;
6401 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6402 tree cond_initial_val 6274 tree cond_initial_val
6403 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); 6275 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6404 6276
6405 gcc_assert (cond_reduc_val != NULL_TREE); 6277 gcc_assert (cond_reduc_val != NULL_TREE);
6406 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); 6278 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6407 if (cond_initial_dt == vect_constant_def 6279 if (cond_initial_dt == vect_constant_def
6408 && types_compatible_p (TREE_TYPE (cond_initial_val), 6280 && types_compatible_p (TREE_TYPE (cond_initial_val),
6415 if (dump_enabled_p ()) 6287 if (dump_enabled_p ())
6416 dump_printf_loc (MSG_NOTE, vect_location, 6288 dump_printf_loc (MSG_NOTE, vect_location,
6417 "condition expression based on " 6289 "condition expression based on "
6418 "compile time constant.\n"); 6290 "compile time constant.\n");
6419 /* Record reduction code at analysis stage. */ 6291 /* Record reduction code at analysis stage. */
6420 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) 6292 STMT_VINFO_REDUC_CODE (reduc_info)
6421 = integer_onep (e) ? MAX_EXPR : MIN_EXPR; 6293 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6422 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6294 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6423 = CONST_COND_REDUCTION;
6424 } 6295 }
6425 } 6296 }
6426 } 6297 }
6427 } 6298 }
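  /* To illustrate the condition reductions classified above (the variable
     names are made up): a loop of the form

       int last = -1;
       for (int i = 0; i < n; i++)
	 if (a[i] > limit)
	   last = i;

     is a COND_REDUCTION; because the assigned value is the induction
     variable i, it can be simplified to an INTEGER_INDUC_COND_REDUCTION
     (roughly a MAX_EXPR over the masked induction values), and when the
     assigned value is a compile-time constant it becomes a
     CONST_COND_REDUCTION.  */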
6428 6299
6429 if (orig_stmt_info) 6300 if (STMT_VINFO_LIVE_P (phi_info))
6430 gcc_assert (tmp == orig_stmt_info
6431 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6432 else
6433 /* We changed STMT to be the first stmt in reduction chain, hence we
6434 check that in this case the first element in the chain is STMT. */
6435 gcc_assert (tmp == stmt_info
6436 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6437
6438 if (STMT_VINFO_LIVE_P (reduc_def_info))
6439 return false; 6301 return false;
6440 6302
6441 if (slp_node) 6303 if (slp_node)
6442 ncopies = 1; 6304 ncopies = 1;
6443 else 6305 else
6444 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6306 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6445 6307
6446 gcc_assert (ncopies >= 1); 6308 gcc_assert (ncopies >= 1);
6447 6309
6448 vec_mode = TYPE_MODE (vectype_in);
6449 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); 6310 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6450 6311
6451 if (code == COND_EXPR) 6312 if (nested_cycle)
6452 { 6313 {
6453 /* Only call during the analysis stage, otherwise we'll lose 6314 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6454 STMT_VINFO_TYPE. */ 6315 == vect_double_reduction_def);
6455 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL, 6316 double_reduc = true;
6456 ops[reduc_index], 0, NULL,
6457 cost_vec))
6458 {
6459 if (dump_enabled_p ())
6460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6461 "unsupported condition in reduction\n");
6462 return false;
6463 }
6464 }
6465 else
6466 {
6467 /* 4. Supportable by target? */
6468
6469 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6470 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6471 {
6472 /* Shifts and rotates are only supported by vectorizable_shifts,
6473 not vectorizable_reduction. */
6474 if (dump_enabled_p ())
6475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6476 "unsupported shift or rotation.\n");
6477 return false;
6478 }
6479
6480 /* 4.1. check support for the operation in the loop */
6481 optab = optab_for_tree_code (code, vectype_in, optab_default);
6482 if (!optab)
6483 {
6484 if (dump_enabled_p ())
6485 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6486 "no optab.\n");
6487
6488 return false;
6489 }
6490
6491 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6492 {
6493 if (dump_enabled_p ())
6494 dump_printf (MSG_NOTE, "op not supported by target.\n");
6495
6496 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6497 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6498 return false;
6499
6500 if (dump_enabled_p ())
6501 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6502 }
6503
6504 /* Worthwhile without SIMD support? */
6505 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6506 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6507 {
6508 if (dump_enabled_p ())
6509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6510 "not worthwhile without SIMD support.\n");
6511
6512 return false;
6513 }
6514 } 6317 }
6515 6318
6516 /* 4.2. Check support for the epilog operation. 6319 /* 4.2. Check support for the epilog operation.
6517 6320
6518 If STMT represents a reduction pattern, then the type of the 6321 If STMT represents a reduction pattern, then the type of the
6546 the arguments are the same as the type of the reduction variable. 6349 the arguments are the same as the type of the reduction variable.
6547 For "regular" reductions we can therefore use the same vector type 6350 For "regular" reductions we can therefore use the same vector type
6548 (and also the same tree-code) when generating the epilog code and 6351 (and also the same tree-code) when generating the epilog code and
6549 when generating the code inside the loop. */ 6352 when generating the code inside the loop. */
6550 6353
6551 vect_reduction_type reduction_type 6354 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6552 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); 6355 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6553 if (orig_stmt_info 6356
6554 && (reduction_type == TREE_CODE_REDUCTION 6357 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6555 || reduction_type == FOLD_LEFT_REDUCTION)) 6358 if (reduction_type == TREE_CODE_REDUCTION)
6556 { 6359 {
6557 /* This is a reduction pattern: get the vectype from the type of the 6360 /* Check whether it's ok to change the order of the computation.
6558 reduction variable, and get the tree-code from orig_stmt. */ 6361 Generally, when vectorizing a reduction we change the order of the
6559 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt); 6362 computation. This may change the behavior of the program in some
6560 gcc_assert (vectype_out); 6363 cases, so we need to check that this is ok. One exception is when
6561 vec_mode = TYPE_MODE (vectype_out); 6364 vectorizing an outer-loop: the inner-loop is executed sequentially,
6562 } 6365 and therefore vectorizing reductions in the inner-loop during
6563 else 6366 outer-loop vectorization is safe. */
6564 { 6367 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6565 /* Regular reduction: use the same vectype and tree-code as used for 6368 {
6566 the vector code inside the loop can be used for the epilog code. */ 6369 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6567 orig_code = code; 6370 is not directly used in stmt. */
6568 6371 if (!only_slp_reduc_chain
6569 if (code == MINUS_EXPR) 6372 && reduc_chain_length != 1)
6570 orig_code = PLUS_EXPR; 6373 {
6571 6374 if (dump_enabled_p ())
6572 /* For simple condition reductions, replace with the actual expression 6375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6573 we want to base our reduction around. */ 6376 "in-order reduction chain without SLP.\n");
6574 if (reduction_type == CONST_COND_REDUCTION) 6377 return false;
6575 { 6378 }
6576 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); 6379 STMT_VINFO_REDUC_TYPE (reduc_info)
6577 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR); 6380 = reduction_type = FOLD_LEFT_REDUCTION;
6578 } 6381 }
6579 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION) 6382 else if (!commutative_tree_code (orig_code)
6580 orig_code = cond_reduc_op_code; 6383 || !associative_tree_code (orig_code))
6581 } 6384 {
6582 6385 if (dump_enabled_p ())
6583 if (nested_cycle) 6386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6584 { 6387 "reduction: not commutative/associative");
6585 def_bb = gimple_bb (reduc_def_phi); 6388 return false;
6586 def_stmt_loop = def_bb->loop_father; 6389 }
6587 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, 6390 }
6588 loop_preheader_edge (def_stmt_loop)); 6391
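  /* A concrete, purely illustrative case for the in-order requirement
     above: a loop like

       double s = 0.0;
       for (int i = 0; i < n; i++)
	 s += x[i];

     must not be reassociated unless -fassociative-math (for instance via
     -ffast-math) is in effect, since FP addition is not associative; it is
     therefore handled as FOLD_LEFT_REDUCTION, accumulating the vector
     elements in the original scalar order, while integer additions can
     typically stay TREE_CODE_REDUCTION and be reassociated freely.  */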
6589 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg); 6392 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6590 if (def_arg_stmt_info 6393 && ncopies > 1)
6591 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info) 6394 {
6592 == vect_double_reduction_def)) 6395 if (dump_enabled_p ())
6593 double_reduc = true; 6396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6594 } 6397 "multiple types in double reduction or condition "
6595 6398 "reduction or fold-left reduction.\n");
6596 reduc_fn = IFN_LAST; 6399 return false;
6597 6400 }
6401
6402 internal_fn reduc_fn = IFN_LAST;
6598 if (reduction_type == TREE_CODE_REDUCTION 6403 if (reduction_type == TREE_CODE_REDUCTION
6599 || reduction_type == FOLD_LEFT_REDUCTION 6404 || reduction_type == FOLD_LEFT_REDUCTION
6600 || reduction_type == INTEGER_INDUC_COND_REDUCTION 6405 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6601 || reduction_type == CONST_COND_REDUCTION) 6406 || reduction_type == CONST_COND_REDUCTION)
6602 { 6407 {
6637 6442
6638 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type, 6443 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6639 OPTIMIZE_FOR_SPEED)) 6444 OPTIMIZE_FOR_SPEED))
6640 reduc_fn = IFN_REDUC_MAX; 6445 reduc_fn = IFN_REDUC_MAX;
6641 } 6446 }
6447 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6642 6448
6643 if (reduction_type != EXTRACT_LAST_REDUCTION 6449 if (reduction_type != EXTRACT_LAST_REDUCTION
6644 && (!nested_cycle || double_reduc) 6450 && (!nested_cycle || double_reduc)
6645 && reduc_fn == IFN_LAST 6451 && reduc_fn == IFN_LAST
6646 && !nunits_out.is_constant ()) 6452 && !nunits_out.is_constant ())
6650 "missing target support for reduction on" 6456 "missing target support for reduction on"
6651 " variable-length vectors.\n"); 6457 " variable-length vectors.\n");
6652 return false; 6458 return false;
6653 } 6459 }
6654 6460
6655 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6656 && ncopies > 1)
6657 {
6658 if (dump_enabled_p ())
6659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6660 "multiple types in double reduction or condition "
6661 "reduction.\n");
6662 return false;
6663 }
6664
6665 /* For SLP reductions, see if there is a neutral value we can use. */ 6461 /* For SLP reductions, see if there is a neutral value we can use. */
6666 tree neutral_op = NULL_TREE; 6462 tree neutral_op = NULL_TREE;
6667 if (slp_node) 6463 if (slp_node)
6668 neutral_op = neutral_op_for_slp_reduction 6464 neutral_op = neutral_op_for_slp_reduction
6669 (slp_node_instance->reduc_phis, code, 6465 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6670 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL); 6466 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6671 6467
6672 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) 6468 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6673 { 6469 {
6674 /* We can't support in-order reductions of code such as this: 6470 /* We can't support in-order reductions of code such as this:
6729 /* We checked above that we could build the initial vector when 6525 /* We checked above that we could build the initial vector when
6730 there's a neutral element value. Check here for the case in 6526 there's a neutral element value. Check here for the case in
6731 which each SLP statement has its own initial value and in which 6527 which each SLP statement has its own initial value and in which
6732 that value needs to be repeated for every instance of the 6528 that value needs to be repeated for every instance of the
6733 statement within the initial vector. */ 6529 statement within the initial vector. */
6734 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 6530 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6735 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6736 if (!neutral_op 6531 if (!neutral_op
6737 && !can_duplicate_and_interleave_p (group_size, elt_mode)) 6532 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6533 TREE_TYPE (vectype_out)))
6738 { 6534 {
6739 if (dump_enabled_p ()) 6535 if (dump_enabled_p ())
6740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6741 "unsupported form of SLP reduction for" 6537 "unsupported form of SLP reduction for"
6742 " variable-length vectors: cannot build" 6538 " variable-length vectors: cannot build"
6753 "unsupported form of SLP reduction for" 6549 "unsupported form of SLP reduction for"
6754 " variable-length vectors: the vector size" 6550 " variable-length vectors: the vector size"
6755 " is not a multiple of the number of results.\n"); 6551 " is not a multiple of the number of results.\n");
6756 return false; 6552 return false;
6757 } 6553 }
6758 }
6759
6760 /* In case of widenning multiplication by a constant, we update the type
6761 of the constant to be the type of the other operand. We check that the
6762 constant fits the type in the pattern recognition pass. */
6763 if (code == DOT_PROD_EXPR
6764 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6765 {
6766 if (TREE_CODE (ops[0]) == INTEGER_CST)
6767 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6768 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6769 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6770 else
6771 {
6772 if (dump_enabled_p ())
6773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6774 "invalid types in dot-prod\n");
6775
6776 return false;
6777 }
6778 } 6554 }
6779 6555
6780 if (reduction_type == COND_REDUCTION) 6556 if (reduction_type == COND_REDUCTION)
6781 { 6557 {
6782 widest_int ni; 6558 widest_int ni;
6832 from the vectorized reduction operation generated in the previous iteration. 6608 from the vectorized reduction operation generated in the previous iteration.
6833 6609
6834 This only works when we see both the reduction PHI and its only consumer 6610 This only works when we see both the reduction PHI and its only consumer
6835 in vectorizable_reduction and there are no intermediate stmts 6611 in vectorizable_reduction and there are no intermediate stmts
6836 participating. */ 6612 participating. */
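  /* Sketch of the single def-use cycle (hypothetical SSA names): with
     ncopies == 2 the two copies are chained,

       vsum_1 = PHI <vinit_0, vsum_3>
       vsum_2 = va_4 + vsum_1;
       vsum_3 = vb_5 + vsum_2;

     so only one vector PHI and one accumulator are needed instead of a
     separate PHI per copy; this is what is recorded below in
     STMT_VINFO_FORCE_SINGLE_CYCLE for the transform phase.  */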
6837 stmt_vec_info use_stmt_info;
6838 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6839 if (ncopies > 1 6613 if (ncopies > 1
6840 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) 6614 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6841 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result)) 6615 && reduc_chain_length == 1)
6842 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info) 6616 single_defuse_cycle = true;
6843 { 6617
6844 single_defuse_cycle = true; 6618 if (single_defuse_cycle || lane_reduc_code_p)
6845 epilog_copies = 1; 6619 {
6846 } 6620 gcc_assert (code != COND_EXPR);
6847 else 6621
6848 epilog_copies = ncopies; 6622 /* 4. Supportable by target? */
6623 bool ok = true;
6624
6625 /* 4.1. check support for the operation in the loop */
6626 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6627 if (!optab)
6628 {
6629 if (dump_enabled_p ())
6630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6631 "no optab.\n");
6632 ok = false;
6633 }
6634
6635 machine_mode vec_mode = TYPE_MODE (vectype_in);
6636 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6637 {
6638 if (dump_enabled_p ())
6639 dump_printf (MSG_NOTE, "op not supported by target.\n");
6640 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6641 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6642 ok = false;
6643 else
6644 if (dump_enabled_p ())
6645 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6646 }
6647
6648 /* Worthwhile without SIMD support? */
6649 if (ok
6650 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6651 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6652 {
6653 if (dump_enabled_p ())
6654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6655 "not worthwhile without SIMD support.\n");
6656 ok = false;
6657 }
6658
6659 /* lane-reducing operations have to go through vect_transform_reduction.
6660 For the other cases try without the single cycle optimization. */
6661 if (!ok)
6662 {
6663 if (lane_reduc_code_p)
6664 return false;
6665 else
6666 single_defuse_cycle = false;
6667 }
6668 }
6669 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6849 6670
6850 /* If the reduction stmt is one of the patterns that have lane 6671 /* If the reduction stmt is one of the patterns that have lane
6851 reduction embedded we cannot handle the case of ! single_defuse_cycle. */ 6672 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6852 if ((ncopies > 1 6673 if ((ncopies > 1 && ! single_defuse_cycle)
6853 && ! single_defuse_cycle) 6674 && lane_reduc_code_p)
6854 && (code == DOT_PROD_EXPR
6855 || code == WIDEN_SUM_EXPR
6856 || code == SAD_EXPR))
6857 { 6675 {
6858 if (dump_enabled_p ()) 6676 if (dump_enabled_p ())
6859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6860 "multi def-use cycle not possible for lane-reducing " 6678 "multi def-use cycle not possible for lane-reducing "
6861 "reduction operation\n"); 6679 "reduction operation\n");
6865 if (slp_node) 6683 if (slp_node)
6866 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 6684 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6867 else 6685 else
6868 vec_num = 1; 6686 vec_num = 1;
6869 6687
6688 vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6689 cost_vec);
6690 if (dump_enabled_p ()
6691 && reduction_type == FOLD_LEFT_REDUCTION)
6692 dump_printf_loc (MSG_NOTE, vect_location,
6693 "using an in-order (fold-left) reduction.\n");
6694 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6695 /* All but single defuse-cycle optimized, lane-reducing and fold-left
6696 reductions go through their own vectorizable_* routines. */
6697 if (!single_defuse_cycle
6698 && code != DOT_PROD_EXPR
6699 && code != WIDEN_SUM_EXPR
6700 && code != SAD_EXPR
6701 && reduction_type != FOLD_LEFT_REDUCTION)
6702 {
6703 stmt_vec_info tem
6704 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6705 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6706 {
6707 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6708 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6709 }
6710 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6711 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6712 }
6713 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6714 {
6715 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6716 internal_fn cond_fn = get_conditional_internal_fn (code);
6717
6718 if (reduction_type != FOLD_LEFT_REDUCTION
6719 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6720 && (cond_fn == IFN_LAST
6721 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6722 OPTIMIZE_FOR_SPEED)))
6723 {
6724 if (dump_enabled_p ())
6725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6726 "can't use a fully-masked loop because no"
6727 " conditional operation is available.\n");
6728 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6729 }
6730 else if (reduction_type == FOLD_LEFT_REDUCTION
6731 && reduc_fn == IFN_LAST
6732 && !expand_vec_cond_expr_p (vectype_in,
6733 truth_type_for (vectype_in),
6734 SSA_NAME))
6735 {
6736 if (dump_enabled_p ())
6737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6738 "can't use a fully-masked loop because no"
6739 " conditional operation is available.\n");
6740 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6741 }
6742 else
6743 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6744 vectype_in, NULL);
6745 }
6746 return true;
6747 }
6748
6749 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6750 value. */
6751
6752 bool
6753 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6754 stmt_vec_info *vec_stmt, slp_tree slp_node)
6755 {
6756 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6757 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6758 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6759 int i;
6760 int ncopies;
6761 int j;
6762 int vec_num;
6763
6764 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6765 gcc_assert (reduc_info->is_reduc_info);
6766
6767 if (nested_in_vect_loop_p (loop, stmt_info))
6768 {
6769 loop = loop->inner;
6770 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6771 }
6772
6773 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6774 enum tree_code code = gimple_assign_rhs_code (stmt);
6775 int op_type = TREE_CODE_LENGTH (code);
6776
6777 /* Flatten RHS. */
6778 tree ops[3];
6779 switch (get_gimple_rhs_class (code))
6780 {
6781 case GIMPLE_TERNARY_RHS:
6782 ops[2] = gimple_assign_rhs3 (stmt);
6783 /* Fall thru. */
6784 case GIMPLE_BINARY_RHS:
6785 ops[0] = gimple_assign_rhs1 (stmt);
6786 ops[1] = gimple_assign_rhs2 (stmt);
6787 break;
6788 default:
6789 gcc_unreachable ();
6790 }
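  /* For instance (purely illustrative), for a lane-reducing statement
     x_3 = DOT_PROD_EXPR <a_1, b_2, x_0> the switch above yields
     ops[] = { a_1, b_2, x_0 } with op_type == 3, and STMT_VINFO_REDUC_IDX
     selects x_0 as the operand carried around the loop.  */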
6791
6792 /* All uses but the last are expected to be defined in the loop.
6793 The last use is the reduction variable. In case of nested cycle this
6794 assumption is not true: we use reduc_index to record the index of the
6795 reduction variable. */
6796 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6797 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6798 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6799 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6800
6801 if (slp_node)
6802 {
6803 ncopies = 1;
6804 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6805 }
6806 else
6807 {
6808 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6809 vec_num = 1;
6810 }
6811
6870 internal_fn cond_fn = get_conditional_internal_fn (code); 6812 internal_fn cond_fn = get_conditional_internal_fn (code);
6871 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); 6813 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6872 6814 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6873 if (!vec_stmt) /* transformation not required. */
6874 {
6875 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6876 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6877 {
6878 if (reduction_type != FOLD_LEFT_REDUCTION
6879 && (cond_fn == IFN_LAST
6880 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6881 OPTIMIZE_FOR_SPEED)))
6882 {
6883 if (dump_enabled_p ())
6884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6885 "can't use a fully-masked loop because no"
6886 " conditional operation is available.\n");
6887 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6888 }
6889 else if (reduc_index == -1)
6890 {
6891 if (dump_enabled_p ())
6892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6893 "can't use a fully-masked loop for chained"
6894 " reductions.\n");
6895 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6896 }
6897 else
6898 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6899 vectype_in);
6900 }
6901 if (dump_enabled_p ()
6902 && reduction_type == FOLD_LEFT_REDUCTION)
6903 dump_printf_loc (MSG_NOTE, vect_location,
6904 "using an in-order (fold-left) reduction.\n");
6905 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6906 return true;
6907 }
6908 6815
6909 /* Transform. */ 6816 /* Transform. */
6817 stmt_vec_info new_stmt_info = NULL;
6818 stmt_vec_info prev_stmt_info;
6819 tree new_temp = NULL_TREE;
6820 auto_vec<tree> vec_oprnds0;
6821 auto_vec<tree> vec_oprnds1;
6822 auto_vec<tree> vec_oprnds2;
6823 tree def0;
6910 6824
6911 if (dump_enabled_p ()) 6825 if (dump_enabled_p ())
6912 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); 6826 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6913 6827
6914 /* FORNOW: Multiple types are not supported for condition. */ 6828 /* FORNOW: Multiple types are not supported for condition. */
6915 if (code == COND_EXPR) 6829 if (code == COND_EXPR)
6916 gcc_assert (ncopies == 1); 6830 gcc_assert (ncopies == 1);
6917 6831
6918 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); 6832 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6919 6833
6834 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6920 if (reduction_type == FOLD_LEFT_REDUCTION) 6835 if (reduction_type == FOLD_LEFT_REDUCTION)
6921 return vectorize_fold_left_reduction 6836 {
6922 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code, 6837 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6923 reduc_fn, ops, vectype_in, reduc_index, masks); 6838 return vectorize_fold_left_reduction
6924 6839 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6925 if (reduction_type == EXTRACT_LAST_REDUCTION) 6840 reduc_fn, ops, vectype_in, reduc_index, masks);
6926 { 6841 }
6927 gcc_assert (!slp_node); 6842
6928 return vectorizable_condition (stmt_info, gsi, vec_stmt, 6843 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6929 NULL, reduc_index, NULL, NULL); 6844 gcc_assert (single_defuse_cycle
6930 } 6845 || code == DOT_PROD_EXPR
6846 || code == WIDEN_SUM_EXPR
6847 || code == SAD_EXPR);
6931 6848
6932 /* Create the destination vector */ 6849 /* Create the destination vector */
6933 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); 6850 tree scalar_dest = gimple_assign_lhs (stmt);
6851 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6934 6852
6935 prev_stmt_info = NULL; 6853 prev_stmt_info = NULL;
6936 prev_phi_info = NULL;
6937 if (!slp_node) 6854 if (!slp_node)
6938 { 6855 {
6939 vec_oprnds0.create (1); 6856 vec_oprnds0.create (1);
6940 vec_oprnds1.create (1); 6857 vec_oprnds1.create (1);
6941 if (op_type == ternary_op) 6858 if (op_type == ternary_op)
6942 vec_oprnds2.create (1); 6859 vec_oprnds2.create (1);
6943 } 6860 }
6944 6861
6945 phis.create (vec_num);
6946 vect_defs.create (vec_num);
6947 if (!slp_node)
6948 vect_defs.quick_push (NULL_TREE);
6949
6950 if (slp_node)
6951 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6952 else
6953 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
6954
6955 for (j = 0; j < ncopies; j++) 6862 for (j = 0; j < ncopies; j++)
6956 { 6863 {
6957 if (code == COND_EXPR)
6958 {
6959 gcc_assert (!slp_node);
6960 vectorizable_condition (stmt_info, gsi, vec_stmt,
6961 PHI_RESULT (phis[0]->stmt),
6962 reduc_index, NULL, NULL);
6963 /* Multiple types are not supported for condition. */
6964 break;
6965 }
6966
6967 /* Handle uses. */ 6864 /* Handle uses. */
6968 if (j == 0) 6865 if (j == 0)
6969 { 6866 {
6970 if (slp_node) 6867 if (slp_node)
6971 { 6868 {
6972 /* Get vec defs for all the operands except the reduction index, 6869 /* Get vec defs for all the operands except the reduction index,
6973 ensuring the ordering of the ops in the vector is kept. */ 6870 ensuring the ordering of the ops in the vector is kept. */
6974 auto_vec<tree, 3> slp_ops;
6975 auto_vec<vec<tree>, 3> vec_defs; 6871 auto_vec<vec<tree>, 3> vec_defs;
6976 6872 vect_get_slp_defs (slp_node, &vec_defs);
6977 slp_ops.quick_push (ops[0]);
6978 slp_ops.quick_push (ops[1]);
6979 if (op_type == ternary_op)
6980 slp_ops.quick_push (ops[2]);
6981
6982 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6983
6984 vec_oprnds0.safe_splice (vec_defs[0]); 6873 vec_oprnds0.safe_splice (vec_defs[0]);
6985 vec_defs[0].release (); 6874 vec_defs[0].release ();
6986 vec_oprnds1.safe_splice (vec_defs[1]); 6875 vec_oprnds1.safe_splice (vec_defs[1]);
6987 vec_defs[1].release (); 6876 vec_defs[1].release ();
6988 if (op_type == ternary_op) 6877 if (op_type == ternary_op)
7033 } 6922 }
7034 6923
7035 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) 6924 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7036 { 6925 {
7037 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; 6926 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7038 if (masked_loop_p) 6927 if (masked_loop_p && !mask_by_cond_expr)
7039 { 6928 {
7040 /* Make sure that the reduction accumulator is vop[0]. */ 6929 /* Make sure that the reduction accumulator is vop[0]. */
7041 if (reduc_index == 1) 6930 if (reduc_index == 1)
7042 { 6931 {
7043 gcc_assert (commutative_tree_code (code)); 6932 gcc_assert (commutative_tree_code (code));
7057 else 6946 else
7058 { 6947 {
7059 if (op_type == ternary_op) 6948 if (op_type == ternary_op)
7060 vop[2] = vec_oprnds2[i]; 6949 vop[2] = vec_oprnds2[i];
7061 6950
6951 if (masked_loop_p && mask_by_cond_expr)
6952 {
6953 tree mask = vect_get_loop_mask (gsi, masks,
6954 vec_num * ncopies,
6955 vectype_in, i * ncopies + j);
6956 build_vect_cond_expr (code, vop, mask, gsi);
6957 }
6958
7062 gassign *new_stmt = gimple_build_assign (vec_dest, code, 6959 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7063 vop[0], vop[1], vop[2]); 6960 vop[0], vop[1], vop[2]);
7064 new_temp = make_ssa_name (vec_dest, new_stmt); 6961 new_temp = make_ssa_name (vec_dest, new_stmt);
7065 gimple_assign_set_lhs (new_stmt, new_temp); 6962 gimple_assign_set_lhs (new_stmt, new_temp);
7066 new_stmt_info 6963 new_stmt_info
7067 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); 6964 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7068 } 6965 }
7069 6966
7070 if (slp_node) 6967 if (slp_node)
7071 { 6968 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7072 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7073 vect_defs.quick_push (new_temp);
7074 }
7075 else
7076 vect_defs[0] = new_temp;
7077 } 6969 }
7078 6970
7079 if (slp_node) 6971 if (slp_node || single_defuse_cycle)
7080 continue; 6972 continue;
7081 6973
7082 if (j == 0) 6974 if (j == 0)
7083 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info; 6975 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7084 else 6976 else
7085 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; 6977 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7086 6978
7087 prev_stmt_info = new_stmt_info; 6979 prev_stmt_info = new_stmt_info;
7088 } 6980 }
7089 6981
7090 /* Finalize the reduction-phi (set its arguments) and create the 6982 if (single_defuse_cycle && !slp_node)
7091 epilog reduction code. */ 6983 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7092 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7093 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7094
7095 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7096 epilog_copies, reduc_fn, phis,
7097 double_reduc, slp_node, slp_node_instance,
7098 cond_reduc_val, cond_reduc_op_code,
7099 neutral_op);
7100 6984
7101 return true; 6985 return true;
7102 } 6986 }
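(Illustrative aside.) In the fully-masked path above, the mask_by_cond_expr case selects between an operand and the operation's neutral value under the loop mask, so inactive lanes leave the accumulator unchanged. A minimal C sketch of that idea, with invented names (acc, active) and 0 assumed as the neutral value for addition:

/* Illustrative sketch only (not code from this file): the per-lane effect
   of masking a sum reduction.  Inactive lanes contribute the neutral
   element (0 for PLUS), so the result equals the sum over active lanes.  */
int
masked_sum_sketch (const int *a, int n, int vf)
{
  int acc = 0;
  for (int i = 0; i < n; i += vf)
    for (int lane = 0; lane < vf; lane++)
      {
        int active = (i + lane) < n;        /* loop mask for this lane */
        acc += active ? a[i + lane] : 0;    /* neutral value when inactive */
      }
  return acc;
}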
6987
6988 /* Transform phase of a cycle PHI. */
6989
6990 bool
6991 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6992 slp_tree slp_node, slp_instance slp_node_instance)
6993 {
6994 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6995 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6996 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6997 int i;
6998 int ncopies;
6999 stmt_vec_info prev_phi_info;
7000 int j;
7001 bool nested_cycle = false;
7002 int vec_num;
7003
7004 if (nested_in_vect_loop_p (loop, stmt_info))
7005 {
7006 loop = loop->inner;
7007 nested_cycle = true;
7008 }
7009
7010 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7011 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7012 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7013 gcc_assert (reduc_info->is_reduc_info);
7014
7015 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7016 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7017 /* Leave the scalar phi in place. */
7018 return true;
7019
7020 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7021 /* For a nested cycle we do not fill the above. */
7022 if (!vectype_in)
7023 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7024 gcc_assert (vectype_in);
7025
7026 if (slp_node)
7027 {
7028 /* The size vect_schedule_slp_instance computes is off for us. */
7029 vec_num = vect_get_num_vectors
7030 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7031 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
7032 ncopies = 1;
7033 }
7034 else
7035 {
7036 vec_num = 1;
7037 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7038 }
7039
7040 /* Check whether we should use a single PHI node and accumulate
7041 vectors to one before the backedge. */
7042 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7043 ncopies = 1;
7044
7045 /* Create the destination vector */
7046 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7047 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7048 vectype_out);
7049
7050 /* Get the loop-entry arguments. */
7051 tree vec_initial_def;
7052 auto_vec<tree> vec_initial_defs;
7053 if (slp_node)
7054 {
7055 vec_initial_defs.reserve (vec_num);
7056 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7057 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7058 tree neutral_op
7059 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7060 STMT_VINFO_REDUC_CODE (reduc_info),
7061 first != NULL);
7062 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
7063 &vec_initial_defs, vec_num,
7064 first != NULL, neutral_op);
7065 }
7066 else
7067 {
7068 /* Get at the scalar def before the loop that defines the initial
7069 value of the reduction variable. */
7070 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7071 loop_preheader_edge (loop));
7072 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7073 and we can't use zero for induc_val, use initial_def. Similarly
7074 for REDUC_MIN and initial_def larger than the base. */
7075 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7076 {
7077 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7078 if (TREE_CODE (initial_def) == INTEGER_CST
7079 && !integer_zerop (induc_val)
7080 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7081 && tree_int_cst_lt (initial_def, induc_val))
7082 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7083 && tree_int_cst_lt (induc_val, initial_def))))
7084 {
7085 induc_val = initial_def;
7086 /* Communicate we used the initial_def to epilogue
7087 generation. */
7088 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7089 }
7090 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7091 }
7092 else if (nested_cycle)
7093 {
7094 /* Do not use an adjustment def as that case is not supported
7095 correctly if ncopies is not one. */
7096 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
7097 reduc_stmt_info);
7098 }
7099 else
7100 {
7101 tree adjustment_def = NULL_TREE;
7102 tree *adjustment_defp = &adjustment_def;
7103 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7104 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7105 adjustment_defp = NULL;
7106 vec_initial_def
7107 = get_initial_def_for_reduction (reduc_stmt_info, code,
7108 initial_def, adjustment_defp);
7109 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7110 }
7111 vec_initial_defs.create (1);
7112 vec_initial_defs.quick_push (vec_initial_def);
7113 }
7114
7115 /* Generate the reduction PHIs upfront. */
7116 prev_phi_info = NULL;
7117 for (i = 0; i < vec_num; i++)
7118 {
7119 tree vec_init_def = vec_initial_defs[i];
7120 for (j = 0; j < ncopies; j++)
7121 {
7122 /* Create the reduction-phi that defines the reduction
7123 operand. */
7124 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7125 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7126
7127 /* Set the loop-entry arg of the reduction-phi. */
7128 if (j != 0 && nested_cycle)
7129 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7130 vec_init_def);
7131 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7132 UNKNOWN_LOCATION);
7133
7134 /* The loop-latch arg is set in epilogue processing. */
7135
7136 if (slp_node)
7137 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7138 else
7139 {
7140 if (j == 0)
7141 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7142 else
7143 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7144 prev_phi_info = new_phi_info;
7145 }
7146 }
7147 }
7148
7149 return true;
7150 }
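(Illustrative aside.) The PHIs created above form the usual reduction def-use cycle: the loop-entry argument carries vec_initial_def, while the latch argument is wired up later during epilogue processing. A hedged scalar analogue with invented names:

/* Illustrative sketch only: scalar shape of the cycle the vector PHI mirrors.  */
int
reduction_cycle_sketch (const int *a, int n, int init)
{
  int acc = init;          /* loop-entry (preheader) argument of the PHI */
  for (int i = 0; i < n; i++)
    acc = acc + a[i];      /* this def becomes the latch argument */
  return acc;              /* the epilogue reduces the vector accumulator */
}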
7151
7152 /* Vectorizes LC PHIs. */
7153
7154 bool
7155 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7156 slp_tree slp_node)
7157 {
7158 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7159 if (!loop_vinfo
7160 || !is_a <gphi *> (stmt_info->stmt)
7161 || gimple_phi_num_args (stmt_info->stmt) != 1)
7162 return false;
7163
7164 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7165 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7166 return false;
7167
7168 if (!vec_stmt) /* transformation not required. */
7169 {
7170 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7171 return true;
7172 }
7173
7174 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7175 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7176 basic_block bb = gimple_bb (stmt_info->stmt);
7177 edge e = single_pred_edge (bb);
7178 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7179 vec<tree> vec_oprnds = vNULL;
7180 vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7181 stmt_info, &vec_oprnds, NULL, slp_node);
7182 if (slp_node)
7183 {
7184 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7185 gcc_assert (vec_oprnds.length () == vec_num);
7186 for (unsigned i = 0; i < vec_num; i++)
7187 {
7188 /* Create the vectorized LC PHI node. */
7189 gphi *new_phi = create_phi_node (vec_dest, bb);
7190 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7191 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7192 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7193 }
7194 }
7195 else
7196 {
7197 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7198 stmt_vec_info prev_phi_info = NULL;
7199 for (unsigned i = 0; i < ncopies; i++)
7200 {
7201 if (i != 0)
7202 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7203 /* Create the vectorized LC PHI node. */
7204 gphi *new_phi = create_phi_node (vec_dest, bb);
7205 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7206 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7207 if (i == 0)
7208 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7209 else
7210 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7211 prev_phi_info = new_phi_info;
7212 }
7213 }
7214 vec_oprnds.release ();
7215
7216 return true;
7217 }
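(Illustrative aside.) A loop-closed (LC) PHI has exactly one argument, e.g. x_4 = PHI <x_3(exit_edge)> in a GIMPLE dump (names invented here); it simply carries a loop value out of the loop. A hedged C analogue:

/* Illustrative sketch only: the value an LC PHI carries out of the loop.  */
int
lc_phi_sketch (const int *a, int n)
{
  int x = 0;
  for (int i = 0; i < n; i++)
    x = a[i];     /* x_3: last value computed inside the loop */
  return x;       /* x_4: the single-argument LC PHI use after the loop */
}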
7218
7103 7219
7104 /* Function vect_min_worthwhile_factor. 7220 /* Function vect_min_worthwhile_factor.
7105 7221
7106 For a loop where we could vectorize the operation indicated by CODE, 7222 For a loop where we could vectorize the operation indicated by CODE,
7107 return the minimum vectorization factor that makes it worthwhile 7223 return the minimum vectorization factor that makes it worthwhile
7153 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 7269 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7154 stmt_vec_info *vec_stmt, slp_tree slp_node, 7270 stmt_vec_info *vec_stmt, slp_tree slp_node,
7155 stmt_vector_for_cost *cost_vec) 7271 stmt_vector_for_cost *cost_vec)
7156 { 7272 {
7157 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 7273 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7158 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 7274 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7159 unsigned ncopies; 7275 unsigned ncopies;
7160 bool nested_in_vect_loop = false; 7276 bool nested_in_vect_loop = false;
7161 struct loop *iv_loop; 7277 class loop *iv_loop;
7162 tree vec_def; 7278 tree vec_def;
7163 edge pe = loop_preheader_edge (loop); 7279 edge pe = loop_preheader_edge (loop);
7164 basic_block new_bb; 7280 basic_block new_bb;
7165 tree new_vec, vec_init, vec_step, t; 7281 tree new_vec, vec_init, vec_step, t;
7166 tree new_name; 7282 tree new_name;
7287 latch_e = loop_latch_edge (iv_loop); 7403 latch_e = loop_latch_edge (iv_loop);
7288 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 7404 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7289 7405
7290 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); 7406 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7291 gcc_assert (step_expr != NULL_TREE); 7407 gcc_assert (step_expr != NULL_TREE);
7408 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7292 7409
7293 pe = loop_preheader_edge (iv_loop); 7410 pe = loop_preheader_edge (iv_loop);
7294 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, 7411 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7295 loop_preheader_edge (iv_loop)); 7412 loop_preheader_edge (iv_loop));
7296 7413
7297 stmts = NULL; 7414 stmts = NULL;
7298 if (!nested_in_vect_loop) 7415 if (!nested_in_vect_loop)
7299 { 7416 {
7300 /* Convert the initial value to the desired type. */ 7417 /* Convert the initial value to the IV update type. */
7301 tree new_type = TREE_TYPE (vectype); 7418 tree new_type = TREE_TYPE (step_expr);
7302 init_expr = gimple_convert (&stmts, new_type, init_expr); 7419 init_expr = gimple_convert (&stmts, new_type, init_expr);
7303 7420
7304 /* If we are using the loop mask to "peel" for alignment then we need 7421 /* If we are using the loop mask to "peel" for alignment then we need
7305 to adjust the start value here. */ 7422 to adjust the start value here. */
7306 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); 7423 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7316 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type, 7433 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7317 init_expr, skip_step); 7434 init_expr, skip_step);
7318 } 7435 }
7319 } 7436 }
7320 7437
7321 /* Convert the step to the desired type. */
7322 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7323
7324 if (stmts) 7438 if (stmts)
7325 { 7439 {
7326 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7440 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7327 gcc_assert (!new_bb); 7441 gcc_assert (!new_bb);
7328 } 7442 }
7351 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 7465 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7352 expr, step_expr); 7466 expr, step_expr);
7353 if (! CONSTANT_CLASS_P (new_name)) 7467 if (! CONSTANT_CLASS_P (new_name))
7354 new_name = vect_init_vector (stmt_info, new_name, 7468 new_name = vect_init_vector (stmt_info, new_name,
7355 TREE_TYPE (step_expr), NULL); 7469 TREE_TYPE (step_expr), NULL);
7356 new_vec = build_vector_from_val (vectype, new_name); 7470 new_vec = build_vector_from_val (step_vectype, new_name);
7357 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); 7471 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7358 7472
7359 /* Now generate the IVs. */ 7473 /* Now generate the IVs. */
7360 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 7474 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7361 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 7475 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7362 unsigned elts = const_nunits * nvects; 7476 unsigned elts = const_nunits * nvects;
7365 gcc_assert (elts % group_size == 0); 7479 gcc_assert (elts % group_size == 0);
7366 tree elt = init_expr; 7480 tree elt = init_expr;
7367 unsigned ivn; 7481 unsigned ivn;
7368 for (ivn = 0; ivn < nivs; ++ivn) 7482 for (ivn = 0; ivn < nivs; ++ivn)
7369 { 7483 {
7370 tree_vector_builder elts (vectype, const_nunits, 1); 7484 tree_vector_builder elts (step_vectype, const_nunits, 1);
7371 stmts = NULL; 7485 stmts = NULL;
7372 for (unsigned eltn = 0; eltn < const_nunits; ++eltn) 7486 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7373 { 7487 {
7374 if (ivn*const_nunits + eltn >= group_size 7488 if (ivn*const_nunits + eltn >= group_size
7375 && (ivn * const_nunits + eltn) % group_size == 0) 7489 && (ivn * const_nunits + eltn) % group_size == 0)
7376 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt), 7490 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7377 elt, step_expr); 7491 elt, step_expr);
7378 elts.quick_push (elt); 7492 elts.quick_push (elt);
7379 } 7493 }
7380 vec_init = gimple_build_vector (&stmts, &elts); 7494 vec_init = gimple_build_vector (&stmts, &elts);
7495 vec_init = gimple_convert (&stmts, vectype, vec_init);
7381 if (stmts) 7496 if (stmts)
7382 { 7497 {
7383 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7498 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7384 gcc_assert (!new_bb); 7499 gcc_assert (!new_bb);
7385 } 7500 }
7390 stmt_vec_info induction_phi_info 7505 stmt_vec_info induction_phi_info
7391 = loop_vinfo->add_stmt (induction_phi); 7506 = loop_vinfo->add_stmt (induction_phi);
7392 induc_def = PHI_RESULT (induction_phi); 7507 induc_def = PHI_RESULT (induction_phi);
7393 7508
7394 /* Create the iv update inside the loop */ 7509 /* Create the iv update inside the loop */
7395 vec_def = make_ssa_name (vec_dest); 7510 gimple_seq stmts = NULL;
7396 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step); 7511 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7397 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7512 vec_def = gimple_build (&stmts,
7398 loop_vinfo->add_stmt (new_stmt); 7513 PLUS_EXPR, step_vectype, vec_def, vec_step);
7514 vec_def = gimple_convert (&stmts, vectype, vec_def);
7515 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7516 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7399 7517
7400 /* Set the arguments of the phi node: */ 7518 /* Set the arguments of the phi node: */
7401 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 7519 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7402 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 7520 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7403 UNKNOWN_LOCATION); 7521 UNKNOWN_LOCATION);
7421 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 7539 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7422 expr, step_expr); 7540 expr, step_expr);
7423 if (! CONSTANT_CLASS_P (new_name)) 7541 if (! CONSTANT_CLASS_P (new_name))
7424 new_name = vect_init_vector (stmt_info, new_name, 7542 new_name = vect_init_vector (stmt_info, new_name,
7425 TREE_TYPE (step_expr), NULL); 7543 TREE_TYPE (step_expr), NULL);
7426 new_vec = build_vector_from_val (vectype, new_name); 7544 new_vec = build_vector_from_val (step_vectype, new_name);
7427 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); 7545 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7428 for (; ivn < nvects; ++ivn) 7546 for (; ivn < nvects; ++ivn)
7429 { 7547 {
7430 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt; 7548 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7431 tree def; 7549 tree def;
7432 if (gimple_code (iv) == GIMPLE_PHI) 7550 if (gimple_code (iv) == GIMPLE_PHI)
7433 def = gimple_phi_result (iv); 7551 def = gimple_phi_result (iv);
7434 else 7552 else
7435 def = gimple_assign_lhs (iv); 7553 def = gimple_assign_lhs (iv);
7436 new_stmt = gimple_build_assign (make_ssa_name (vectype), 7554 gimple_seq stmts = NULL;
7437 PLUS_EXPR, 7555 def = gimple_convert (&stmts, step_vectype, def);
7438 def, vec_step); 7556 def = gimple_build (&stmts,
7557 PLUS_EXPR, step_vectype, def, vec_step);
7558 def = gimple_convert (&stmts, vectype, def);
7439 if (gimple_code (iv) == GIMPLE_PHI) 7559 if (gimple_code (iv) == GIMPLE_PHI)
7440 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7560 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7441 else 7561 else
7442 { 7562 {
7443 gimple_stmt_iterator tgsi = gsi_for_stmt (iv); 7563 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7444 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING); 7564 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7445 } 7565 }
7446 SLP_TREE_VEC_STMTS (slp_node).quick_push 7566 SLP_TREE_VEC_STMTS (slp_node).quick_push
7447 (loop_vinfo->add_stmt (new_stmt)); 7567 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7448 } 7568 }
7449 } 7569 }
7450 7570
7451 return true; 7571 return true;
7452 } 7572 }
7478 else 7598 else
7479 { 7599 {
7480 /* iv_loop is the loop to be vectorized. Create: 7600 /* iv_loop is the loop to be vectorized. Create:
7481 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ 7601 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7482 stmts = NULL; 7602 stmts = NULL;
7483 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr); 7603 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7484 7604
7485 unsigned HOST_WIDE_INT const_nunits; 7605 unsigned HOST_WIDE_INT const_nunits;
7486 if (nunits.is_constant (&const_nunits)) 7606 if (nunits.is_constant (&const_nunits))
7487 { 7607 {
7488 tree_vector_builder elts (vectype, const_nunits, 1); 7608 tree_vector_builder elts (step_vectype, const_nunits, 1);
7489 elts.quick_push (new_name); 7609 elts.quick_push (new_name);
7490 for (i = 1; i < const_nunits; i++) 7610 for (i = 1; i < const_nunits; i++)
7491 { 7611 {
7492 /* Create: new_name_i = new_name + step_expr */ 7612 /* Create: new_name_i = new_name + step_expr */
7493 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), 7613 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7498 new_name_nunits-1] */ 7618 new_name_nunits-1] */
7499 vec_init = gimple_build_vector (&stmts, &elts); 7619 vec_init = gimple_build_vector (&stmts, &elts);
7500 } 7620 }
7501 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))) 7621 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7502 /* Build the initial value directly from a VEC_SERIES_EXPR. */ 7622 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7503 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype, 7623 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7504 new_name, step_expr); 7624 new_name, step_expr);
7505 else 7625 else
7506 { 7626 {
7507 /* Build: 7627 /* Build:
7508 [base, base, base, ...] 7628 [base, base, base, ...]
7509 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */ 7629 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7510 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))); 7630 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7511 gcc_assert (flag_associative_math); 7631 gcc_assert (flag_associative_math);
7512 tree index = build_index_vector (vectype, 0, 1); 7632 tree index = build_index_vector (step_vectype, 0, 1);
7513 tree base_vec = gimple_build_vector_from_val (&stmts, vectype, 7633 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7514 new_name); 7634 new_name);
7515 tree step_vec = gimple_build_vector_from_val (&stmts, vectype, 7635 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7516 step_expr); 7636 step_expr);
7517 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index); 7637 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7518 vec_init = gimple_build (&stmts, MULT_EXPR, vectype, 7638 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7519 vec_init, step_vec); 7639 vec_init, step_vec);
7520 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype, 7640 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7521 vec_init, base_vec); 7641 vec_init, base_vec);
7522 } 7642 }
7643 vec_init = gimple_convert (&stmts, vectype, vec_init);
7523 7644
7524 if (stmts) 7645 if (stmts)
7525 { 7646 {
7526 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7647 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7527 gcc_assert (!new_bb); 7648 gcc_assert (!new_bb);
7556 } 7677 }
7557 7678
7558 t = unshare_expr (new_name); 7679 t = unshare_expr (new_name);
7559 gcc_assert (CONSTANT_CLASS_P (new_name) 7680 gcc_assert (CONSTANT_CLASS_P (new_name)
7560 || TREE_CODE (new_name) == SSA_NAME); 7681 || TREE_CODE (new_name) == SSA_NAME);
7561 new_vec = build_vector_from_val (vectype, t); 7682 new_vec = build_vector_from_val (step_vectype, t);
7562 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); 7683 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7563 7684
7564 7685
7565 /* Create the following def-use cycle: 7686 /* Create the following def-use cycle:
7566 loop prolog: 7687 loop prolog:
7567 vec_init = ... 7688 vec_init = ...
7578 induction_phi = create_phi_node (vec_dest, iv_loop->header); 7699 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7579 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi); 7700 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7580 induc_def = PHI_RESULT (induction_phi); 7701 induc_def = PHI_RESULT (induction_phi);
7581 7702
7582 /* Create the iv update inside the loop */ 7703 /* Create the iv update inside the loop */
7583 vec_def = make_ssa_name (vec_dest); 7704 stmts = NULL;
7584 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step); 7705 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7585 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7706 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7707 vec_def = gimple_convert (&stmts, vectype, vec_def);
7708 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7709 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7586 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt); 7710 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7587 7711
7588 /* Set the arguments of the phi node: */ 7712 /* Set the arguments of the phi node: */
7589 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 7713 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7590 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 7714 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7622 } 7746 }
7623 7747
7624 t = unshare_expr (new_name); 7748 t = unshare_expr (new_name);
7625 gcc_assert (CONSTANT_CLASS_P (new_name) 7749 gcc_assert (CONSTANT_CLASS_P (new_name)
7626 || TREE_CODE (new_name) == SSA_NAME); 7750 || TREE_CODE (new_name) == SSA_NAME);
7627 new_vec = build_vector_from_val (vectype, t); 7751 new_vec = build_vector_from_val (step_vectype, t);
7628 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); 7752 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7629 7753
7630 vec_def = induc_def; 7754 vec_def = induc_def;
7631 prev_stmt_vinfo = induction_phi_info; 7755 prev_stmt_vinfo = induction_phi_info;
7632 for (i = 1; i < ncopies; i++) 7756 for (i = 1; i < ncopies; i++)
7633 { 7757 {
7634 /* vec_i = vec_prev + vec_step */ 7758 /* vec_i = vec_prev + vec_step */
7635 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, 7759 gimple_seq stmts = NULL;
7636 vec_def, vec_step); 7760 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7637 vec_def = make_ssa_name (vec_dest, new_stmt); 7761 vec_def = gimple_build (&stmts,
7638 gimple_assign_set_lhs (new_stmt, vec_def); 7762 PLUS_EXPR, step_vectype, vec_def, vec_step);
7763 vec_def = gimple_convert (&stmts, vectype, vec_def);
7639 7764
7640 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7765 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7766 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7641 new_stmt_info = loop_vinfo->add_stmt (new_stmt); 7767 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7642 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info; 7768 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7643 prev_stmt_vinfo = new_stmt_info; 7769 prev_stmt_vinfo = new_stmt_info;
7644 } 7770 }
7645 } 7771 }
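(Illustrative aside.) Concretely, for the non-SLP path above with start X, step S and a constant vector length of 4: vec_init = [X, X+S, X+2S, X+3S] and vec_step = [4S, 4S, 4S, 4S], so adding vec_step each iteration keeps lane L equal to X + (i + L) * S. A hedged C sketch, with an array of 4 standing in for a vector and assuming niters is a positive multiple of 4:

/* Illustrative sketch only: evolution of a vectorized induction variable.  */
void
vector_iv_sketch (int x, int s, int niters, int last_iv[4])
{
  int iv[4], step[4];
  for (int lane = 0; lane < 4; lane++)
    {
      iv[lane] = x + lane * s;   /* vec_init = [X, X+S, X+2S, X+3S] */
      step[lane] = 4 * s;        /* vec_step = VF * S, broadcast */
    }
  for (int i = 0; i < niters; i += 4)
    for (int lane = 0; lane < 4; lane++)
      {
        last_iv[lane] = iv[lane];   /* IV values used by this vector iteration */
        iv[lane] += step[lane];     /* the IV update inside the loop */
      }
}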
7691 STMT_INFO computes a value that is used outside the loop. Check if 7817 STMT_INFO computes a value that is used outside the loop. Check if
7692 it can be supported. */ 7818 it can be supported. */
7693 7819
7694 bool 7820 bool
7695 vectorizable_live_operation (stmt_vec_info stmt_info, 7821 vectorizable_live_operation (stmt_vec_info stmt_info,
7696 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 7822 gimple_stmt_iterator *gsi,
7697 slp_tree slp_node, int slp_index, 7823 slp_tree slp_node, slp_instance slp_node_instance,
7698 stmt_vec_info *vec_stmt, 7824 int slp_index, bool vec_stmt_p,
7699 stmt_vector_for_cost *) 7825 stmt_vector_for_cost *)
7700 { 7826 {
7701 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 7827 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7702 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 7828 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7703 imm_use_iterator imm_iter; 7829 imm_use_iterator imm_iter;
7704 tree lhs, lhs_type, bitsize, vec_bitsize; 7830 tree lhs, lhs_type, bitsize, vec_bitsize;
7705 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 7831 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7706 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 7832 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7707 int ncopies; 7833 int ncopies;
7710 int vec_entry = 0; 7836 int vec_entry = 0;
7711 poly_uint64 vec_index = 0; 7837 poly_uint64 vec_index = 0;
7712 7838
7713 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); 7839 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7714 7840
7715 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) 7841 /* If a stmt of a reduction is live, vectorize it via
7716 return false; 7842 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7843 validity so just trigger the transform here. */
7844 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7845 {
7846 if (!vec_stmt_p)
7847 return true;
7848 if (slp_node)
7849 {
7850 /* For reduction chains the meta-info is attached to
7851 the group leader. */
7852 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7853 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7854 /* For SLP reductions we vectorize the epilogue for
7855 all involved stmts together. */
7856 else if (slp_index != 0)
7857 return true;
7858 }
7859 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7860 gcc_assert (reduc_info->is_reduc_info);
7861 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7862 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7863 return true;
7864 vect_create_epilog_for_reduction (stmt_info, slp_node,
7865 slp_node_instance);
7866 return true;
7867 }
7717 7868
7718 /* FORNOW. CHECKME. */ 7869 /* FORNOW. CHECKME. */
7719 if (nested_in_vect_loop_p (loop, stmt_info)) 7870 if (nested_in_vect_loop_p (loop, stmt_info))
7720 return false; 7871 return false;
7721 7872
7759 " final result.\n"); 7910 " final result.\n");
7760 return false; 7911 return false;
7761 } 7912 }
7762 } 7913 }
7763 7914
7764 if (!vec_stmt) 7915 if (!vec_stmt_p)
7765 { 7916 {
7766 /* No transformation required. */ 7917 /* No transformation required. */
7767 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) 7918 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7768 { 7919 {
7769 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, 7920 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7795 else 7946 else
7796 { 7947 {
7797 gcc_assert (ncopies == 1 && !slp_node); 7948 gcc_assert (ncopies == 1 && !slp_node);
7798 vect_record_loop_mask (loop_vinfo, 7949 vect_record_loop_mask (loop_vinfo,
7799 &LOOP_VINFO_MASKS (loop_vinfo), 7950 &LOOP_VINFO_MASKS (loop_vinfo),
7800 1, vectype); 7951 1, vectype, NULL);
7801 } 7952 }
7802 } 7953 }
7803 return true; 7954 return true;
7804 } 7955 }
7805 7956
7905 } 8056 }
7906 8057
7907 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */ 8058 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7908 8059
7909 static void 8060 static void
7910 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info) 8061 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
7911 { 8062 {
7912 ssa_op_iter op_iter; 8063 ssa_op_iter op_iter;
7913 imm_use_iterator imm_iter; 8064 imm_use_iterator imm_iter;
7914 def_operand_p def_p; 8065 def_operand_p def_p;
7915 gimple *ustmt; 8066 gimple *ustmt;
7961 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters)) 8112 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7962 return true; 8113 return true;
7963 } 8114 }
7964 8115
7965 widest_int max; 8116 widest_int max;
7966 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8117 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7967 /* Check the upper bound of loop niters. */ 8118 /* Check the upper bound of loop niters. */
7968 if (get_max_loop_iterations (loop, &max)) 8119 if (get_max_loop_iterations (loop, &max))
7969 { 8120 {
7970 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); 8121 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7971 signop sgn = TYPE_SIGN (type); 8122 signop sgn = TYPE_SIGN (type);
7974 return true; 8125 return true;
7975 } 8126 }
7976 return false; 8127 return false;
7977 } 8128 }
7978 8129
7979 /* Return a mask type with half the number of elements as TYPE. */ 8130 /* Return a mask type with half the number of elements as OLD_TYPE,
8131 given that it should have mode NEW_MODE. */
7980 8132
7981 tree 8133 tree
7982 vect_halve_mask_nunits (tree type) 8134 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
7983 { 8135 {
7984 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2); 8136 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
7985 return build_truth_vector_type (nunits, current_vector_size); 8137 return build_truth_vector_type_for_mode (nunits, new_mode);
7986 } 8138 }
7987 8139
7988 /* Return a mask type with twice as many elements as TYPE. */ 8140 /* Return a mask type with twice as many elements as OLD_TYPE,
8141 given that it should have mode NEW_MODE. */
7989 8142
7990 tree 8143 tree
7991 vect_double_mask_nunits (tree type) 8144 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
7992 { 8145 {
7993 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2; 8146 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
7994 return build_truth_vector_type (nunits, current_vector_size); 8147 return build_truth_vector_type_for_mode (nunits, new_mode);
7995 } 8148 }
7996 8149
7997 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to 8150 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7998 contain a sequence of NVECTORS masks that each control a vector of type 8151 contain a sequence of NVECTORS masks that each control a vector of type
7999 VECTYPE. */ 8152 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8153 these vector masks with the vector version of SCALAR_MASK. */
8000 8154
8001 void 8155 void
8002 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, 8156 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8003 unsigned int nvectors, tree vectype) 8157 unsigned int nvectors, tree vectype, tree scalar_mask)
8004 { 8158 {
8005 gcc_assert (nvectors != 0); 8159 gcc_assert (nvectors != 0);
8006 if (masks->length () < nvectors) 8160 if (masks->length () < nvectors)
8007 masks->safe_grow_cleared (nvectors); 8161 masks->safe_grow_cleared (nvectors);
8008 rgroup_masks *rgm = &(*masks)[nvectors - 1]; 8162 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8009 /* The number of scalars per iteration and the number of vectors are 8163 /* The number of scalars per iteration and the number of vectors are
8010 both compile-time constants. */ 8164 both compile-time constants. */
8011 unsigned int nscalars_per_iter 8165 unsigned int nscalars_per_iter
8012 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), 8166 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8013 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); 8167 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8168
8169 if (scalar_mask)
8170 {
8171 scalar_cond_masked_key cond (scalar_mask, nvectors);
8172 loop_vinfo->scalar_cond_masked_set.add (cond);
8173 }
8174
8014 if (rgm->max_nscalars_per_iter < nscalars_per_iter) 8175 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8015 { 8176 {
8016 rgm->max_nscalars_per_iter = nscalars_per_iter; 8177 rgm->max_nscalars_per_iter = nscalars_per_iter;
8017 rgm->mask_type = build_same_sized_truth_vector_type (vectype); 8178 rgm->mask_type = truth_type_for (vectype);
8018 } 8179 }
8019 } 8180 }
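(Illustrative aside.) The nscalars_per_iter value above is just nvectors * nunits / VF, which is exact by construction. A hedged helper showing the arithmetic; for example nvectors == 2, a 4-element vectype and VF == 8 give 1 scalar per iteration:

/* Illustrative sketch only: the rgroup bookkeeping arithmetic.  */
unsigned
rgroup_nscalars_per_iter_sketch (unsigned nvectors, unsigned nunits, unsigned vf)
{
  return nvectors * nunits / vf;   /* exact division, mirroring exact_div above */
}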
8020 8181
8021 /* Given a complete set of masks MASKS, extract mask number INDEX 8182 /* Given a complete set of masks MASKS, extract mask number INDEX
8022 for an rgroup that operates on NVECTORS vectors of type VECTYPE, 8183 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8057 We can then view-convert the mask so that each sequence of 8218 We can then view-convert the mask so that each sequence of
8058 N elements is replaced by a single element. */ 8219 N elements is replaced by a single element. */
8059 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), 8220 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8060 TYPE_VECTOR_SUBPARTS (vectype))); 8221 TYPE_VECTOR_SUBPARTS (vectype)));
8061 gimple_seq seq = NULL; 8222 gimple_seq seq = NULL;
8062 mask_type = build_same_sized_truth_vector_type (vectype); 8223 mask_type = truth_type_for (vectype);
8063 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); 8224 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8064 if (seq) 8225 if (seq)
8065 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); 8226 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8066 } 8227 }
8067 return mask; 8228 return mask;
8069 8230
8070 /* Scale profiling counters by estimation for LOOP which is vectorized 8231 /* Scale profiling counters by estimation for LOOP which is vectorized
8071 by factor VF. */ 8232 by factor VF. */
8072 8233
8073 static void 8234 static void
8074 scale_profile_for_vect_loop (struct loop *loop, unsigned vf) 8235 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8075 { 8236 {
8076 edge preheader = loop_preheader_edge (loop); 8237 edge preheader = loop_preheader_edge (loop);
8077 /* Reduce loop iterations by the vectorization factor. */ 8238 /* Reduce loop iterations by the vectorization factor. */
8078 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf); 8239 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8079 profile_count freq_h = loop->header->count, freq_e = preheader->count (); 8240 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8107 8268
8108 static void 8269 static void
8109 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, 8270 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8110 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store) 8271 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8111 { 8272 {
8112 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8273 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8113 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 8274 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8114 8275
8115 if (dump_enabled_p ()) 8276 if (dump_enabled_p ())
8116 dump_printf_loc (MSG_NOTE, vect_location, 8277 dump_printf_loc (MSG_NOTE, vect_location,
8117 "------>vectorizing statement: %G", stmt_info->stmt); 8278 "------>vectorizing statement: %G", stmt_info->stmt);
8145 8306
8146 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL)) 8307 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8147 *seen_store = stmt_info; 8308 *seen_store = stmt_info;
8148 } 8309 }
8149 8310
8311 /* Helper function to pass to simplify_replace_tree to enable replacing trees
8312 in the hash_map with their corresponding values. */
8313
8314 static tree
8315 find_in_mapping (tree t, void *context)
8316 {
8317 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8318
8319 tree *value = mapping->get (t);
8320 return value ? *value : t;
8321 }
8322
8323 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8324 original loop that has now been vectorized.
8325
8326 The inits of the data_references need to be advanced by the number of
8327 iterations of the main loop. This has been computed in vect_do_peeling and
8328 is stored in parameter ADVANCE. We first restore the data_references
8329 initial offset with the values recorded in ORIG_DRS_INIT.
8330
8331 Since the loop_vec_info of this EPILOGUE was constructed for the original
8332 loop, its stmt_vec_infos all point to the original statements. These need
8333 to be updated to point to their corresponding copies as well as the SSA_NAMES
8334 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8335
8336 The data_reference's connections also need to be updated: their
8337 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8338 stmt_vec_infos, their statements need to point to their corresponding copy,
8339 and if they are gather loads or scatter stores their reference needs to be
8340 updated to point to its corresponding copy. Finally we set
8341 'base_misaligned' to false, as we have already peeled for alignment in the
8342 prologue of the main loop. */
8343
8344 static void
8345 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8346 {
8347 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8348 auto_vec<gimple *> stmt_worklist;
8349 hash_map<tree,tree> mapping;
8350 gimple *orig_stmt, *new_stmt;
8351 gimple_stmt_iterator epilogue_gsi;
8352 gphi_iterator epilogue_phi_gsi;
8353 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8354 basic_block *epilogue_bbs = get_loop_body (epilogue);
8355 unsigned i;
8356
8357 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8358
8359 /* Advance data_reference's with the number of iterations of the previous
8360 loop and its prologue. */
8361 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8362
8363
8364 /* The EPILOGUE loop is a copy of the original loop so they share the same
8365 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8366 point to the copied statements. We also create a mapping of all LHS' in
8367 the original loop and all the LHS' in the EPILOGUE and create worklists to
8368 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8369 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8370 {
8371 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8372 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8373 {
8374 new_stmt = epilogue_phi_gsi.phi ();
8375
8376 gcc_assert (gimple_uid (new_stmt) > 0);
8377 stmt_vinfo
8378 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8379
8380 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8381 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8382
8383 mapping.put (gimple_phi_result (orig_stmt),
8384 gimple_phi_result (new_stmt));
8385 /* PHI nodes can not have patterns or related statements. */
8386 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8387 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8388 }
8389
8390 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8391 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8392 {
8393 new_stmt = gsi_stmt (epilogue_gsi);
8394
8395 gcc_assert (gimple_uid (new_stmt) > 0);
8396 stmt_vinfo
8397 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8398
8399 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8400 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8401
8402 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8403 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8404
8405 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8406 {
8407 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8408 for (gimple_stmt_iterator gsi = gsi_start (seq);
8409 !gsi_end_p (gsi); gsi_next (&gsi))
8410 stmt_worklist.safe_push (gsi_stmt (gsi));
8411 }
8412
8413 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8414 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8415 {
8416 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8417 stmt_worklist.safe_push (stmt);
8418 /* Set BB such that the assert in
8419 'get_initial_def_for_reduction' is able to determine that
8420 the BB of the related stmt is inside this loop. */
8421 gimple_set_bb (stmt,
8422 gimple_bb (new_stmt));
8423 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8424 gcc_assert (related_vinfo == NULL
8425 || related_vinfo == stmt_vinfo);
8426 }
8427 }
8428 }
8429
8430 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8431 using the original main loop and thus need to be updated to refer to the
8432 cloned variables used in the epilogue. */
8433 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8434 {
8435 gimple *stmt = stmt_worklist[i];
8436 tree *new_op;
8437
8438 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8439 {
8440 tree op = gimple_op (stmt, j);
8441 if ((new_op = mapping.get(op)))
8442 gimple_set_op (stmt, j, *new_op);
8443 else
8444 {
8445 /* PR92429: The last argument of simplify_replace_tree disables
8446 folding when replacing arguments. This is required as
8447 otherwise you might end up with different statements than the
8448 ones analyzed in vect_loop_analyze, leading to different
8449 vectorization. */
8450 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8451 &find_in_mapping, &mapping, false);
8452 gimple_set_op (stmt, j, op);
8453 }
8454 }
8455 }
8456
8457 struct data_reference *dr;
8458 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8459 FOR_EACH_VEC_ELT (datarefs, i, dr)
8460 {
8461 orig_stmt = DR_STMT (dr);
8462 gcc_assert (gimple_uid (orig_stmt) > 0);
8463 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8464 /* Data references for gather loads and scatter stores do not use the
8465 updated offset we set using ADVANCE. Instead we have to make sure the
8466 reference in each data reference points to the corresponding copy of
8467 the original in the epilogue. */
8468 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8469 == VMAT_GATHER_SCATTER)
8470 {
8471 DR_REF (dr)
8472 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8473 &find_in_mapping, &mapping);
8474 DR_BASE_ADDRESS (dr)
8475 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8476 &find_in_mapping, &mapping);
8477 }
8478 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8479 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8480 /* The vector size of the epilogue is smaller than that of the main loop
8481 so the alignment is either the same or lower. This means the dr will
8482 by definition be aligned. */
8483 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8484 }
8485
8486 epilogue_vinfo->shared->datarefs_copy.release ();
8487 epilogue_vinfo->shared->save_datarefs ();
8488 }
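(Illustrative aside.) The mapping built above is consumed operand by operand: if the main loop has x_1 = a[i_2] and the epilogue copy has x_7 = a[i_9], the map records x_1 -> x_7 and i_2 -> i_9, and any pattern or related statement still naming x_1 is rewritten to name x_7. The SSA names here are invented for illustration; a toy lookup with strings standing in for trees:

/* Illustrative sketch only: a miniature stand-in for the tree -> tree map.  */
#include <string.h>

static const char *map_sketch[][2] = { { "x_1", "x_7" }, { "i_2", "i_9" } };

static const char *
find_in_mapping_sketch (const char *t)
{
  for (unsigned i = 0; i < sizeof map_sketch / sizeof map_sketch[0]; i++)
    if (strcmp (map_sketch[i][0], t) == 0)
      return map_sketch[i][1];   /* original name -> its epilogue copy */
  return t;                      /* names without a copy stay unchanged */
}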
8489
8150 /* Function vect_transform_loop. 8490 /* Function vect_transform_loop.
8151 8491
8152 The analysis phase has determined that the loop is vectorizable. 8492 The analysis phase has determined that the loop is vectorizable.
8153 Vectorize the loop - create vectorized stmts to replace the scalar 8493 Vectorize the loop - create vectorized stmts to replace the scalar
8154 stmts in the loop, and update the loop exit condition. 8494 stmts in the loop, and update the loop exit condition.
8155 Returns scalar epilogue loop if any. */ 8495 Returns scalar epilogue loop if any. */
8156 8496
8157 struct loop * 8497 class loop *
8158 vect_transform_loop (loop_vec_info loop_vinfo) 8498 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8159 { 8499 {
8160 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8500 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8161 struct loop *epilogue = NULL; 8501 class loop *epilogue = NULL;
8162 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 8502 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8163 int nbbs = loop->num_nodes; 8503 int nbbs = loop->num_nodes;
8164 int i; 8504 int i;
8165 tree niters_vector = NULL_TREE; 8505 tree niters_vector = NULL_TREE;
8166 tree step_vector = NULL_TREE; 8506 tree step_vector = NULL_TREE;
8179 of iterations is constant assume the cost check has been performed 8519 of iterations is constant assume the cost check has been performed
8180 by our caller. If the threshold makes all loops profitable that 8520 by our caller. If the threshold makes all loops profitable that
8181 run at least the (estimated) vectorization factor number of times 8521 run at least the (estimated) vectorization factor number of times
8182 checking is pointless, too. */ 8522 checking is pointless, too. */
8183 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 8523 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8184 if (th >= vect_vf_for_cost (loop_vinfo) 8524 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8185 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8186 { 8525 {
8187 if (dump_enabled_p ()) 8526 if (dump_enabled_p ())
8188 dump_printf_loc (MSG_NOTE, vect_location, 8527 dump_printf_loc (MSG_NOTE, vect_location,
8189 "Profitability threshold is %d loop iterations.\n", 8528 "Profitability threshold is %d loop iterations.\n",
8190 th); 8529 th);
8191 check_profitability = true; 8530 check_profitability = true;
8192 } 8531 }
8193 8532
8194 /* Make sure there exists a single-predecessor exit bb. Do this before 8533 /* Make sure there exists a single-predecessor exit bb. Do this before
8195 versioning. */ 8534 versioning. */
8196 edge e = single_exit (loop); 8535 edge e = single_exit (loop);
8197 if (! single_pred_p (e->dest)) 8536 if (! single_pred_p (e->dest))
8198 { 8537 {
8199 split_loop_exit_edge (e); 8538 split_loop_exit_edge (e, true);
8200 if (dump_enabled_p ()) 8539 if (dump_enabled_p ())
8201 dump_printf (MSG_NOTE, "split exit edge\n"); 8540 dump_printf (MSG_NOTE, "split exit edge\n");
8202 } 8541 }
8203 8542
8204 /* Version the loop first, if required, so the profitability check 8543 /* Version the loop first, if required, so the profitability check
8205 comes first. */ 8544 comes first. */
8206 8545
8207 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 8546 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8208 { 8547 {
8209 poly_uint64 versioning_threshold 8548 class loop *sloop
8210 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); 8549 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8211 if (check_profitability 8550 sloop->force_vectorize = false;
8212 && ordered_p (poly_uint64 (th), versioning_threshold))
8213 {
8214 versioning_threshold = ordered_max (poly_uint64 (th),
8215 versioning_threshold);
8216 check_profitability = false;
8217 }
8218 vect_loop_versioning (loop_vinfo, th, check_profitability,
8219 versioning_threshold);
8220 check_profitability = false; 8551 check_profitability = false;
8221 } 8552 }
8222 8553
8223 /* Make sure there exists a single-predecessor exit bb also on the 8554 /* Make sure there exists a single-predecessor exit bb also on the
8224 scalar loop copy. Do this after versioning but before peeling 8555 scalar loop copy. Do this after versioning but before peeling
8228 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) 8559 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8229 { 8560 {
8230 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)); 8561 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8231 if (! single_pred_p (e->dest)) 8562 if (! single_pred_p (e->dest))
8232 { 8563 {
8233 split_loop_exit_edge (e); 8564 split_loop_exit_edge (e, true);
8234 if (dump_enabled_p ()) 8565 if (dump_enabled_p ())
8235 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n"); 8566 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8236 } 8567 }
8237 } 8568 }
8238 8569
8239 tree niters = vect_build_loop_niters (loop_vinfo); 8570 tree niters = vect_build_loop_niters (loop_vinfo);
8240 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; 8571 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8241 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); 8572 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8242 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); 8573 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8574 tree advance;
8575 drs_init_vec orig_drs_init;
8576
8243 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, 8577 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8244 &step_vector, &niters_vector_mult_vf, th, 8578 &step_vector, &niters_vector_mult_vf, th,
8245 check_profitability, niters_no_overflow); 8579 check_profitability, niters_no_overflow,
8580 &advance);
8581
8582 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8583 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8584 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8585 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8246 8586
8247 if (niters_vector == NULL_TREE) 8587 if (niters_vector == NULL_TREE)
8248 { 8588 {
8249 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 8589 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8250 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 8590 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8314 && dump_enabled_p ()) 8654 && dump_enabled_p ())
8315 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); 8655 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8316 8656
8317 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def 8657 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8318 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 8658 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8319 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 8659 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8660 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8661 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8320 && ! PURE_SLP_STMT (stmt_info)) 8662 && ! PURE_SLP_STMT (stmt_info))
8321 { 8663 {
8322 if (dump_enabled_p ()) 8664 if (dump_enabled_p ())
8323 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n"); 8665 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8324 vect_transform_stmt (stmt_info, NULL, NULL, NULL); 8666 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8466 dump_printf_loc (MSG_NOTE, vect_location, 8808 dump_printf_loc (MSG_NOTE, vect_location,
8467 "OUTER LOOP VECTORIZED\n"); 8809 "OUTER LOOP VECTORIZED\n");
8468 dump_printf (MSG_NOTE, "\n"); 8810 dump_printf (MSG_NOTE, "\n");
8469 } 8811 }
8470 else 8812 else
8471 { 8813 dump_printf_loc (MSG_NOTE, vect_location,
8472 dump_printf_loc (MSG_NOTE, vect_location, 8814 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
8473 "LOOP EPILOGUE VECTORIZED (VS="); 8815 GET_MODE_NAME (loop_vinfo->vector_mode));
8474 dump_dec (MSG_NOTE, current_vector_size); 8816 }
8475 dump_printf (MSG_NOTE, ")\n"); 8817
8476 } 8818 /* Loops vectorized with a variable factor won't benefit from
8477 } 8819 unrolling/peeling. */
8478 8820 if (!vf.is_constant ())
8821 {
8822 loop->unroll = 1;
8823 if (dump_enabled_p ())
8824 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8825 " variable-length vectorization factor\n");
8826 }
8479 /* Free SLP instances here because otherwise stmt reference counting 8827 /* Free SLP instances here because otherwise stmt reference counting
8480 won't work. */ 8828 won't work. */
8481 slp_instance instance; 8829 slp_instance instance;
8482 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) 8830 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8483 vect_free_slp_instance (instance, true); 8831 vect_free_slp_instance (instance, true);
8484 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); 8832 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8485 /* Clear up the safelen field since its value is invalid after vectorization, 8833 /* Clear up the safelen field since its value is invalid after vectorization,
8486 because the vectorized loop can have loop-carried dependencies. */ 8834 because the vectorized loop can have loop-carried dependencies. */
8487 loop->safelen = 0; 8835 loop->safelen = 0;
8488 8836
8489 /* Don't vectorize epilogue for epilogue. */
8490 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8491 epilogue = NULL;
8492
8493 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8494 epilogue = NULL;
8495
8496 if (epilogue) 8837 if (epilogue)
8497 { 8838 {
8498 auto_vector_sizes vector_sizes; 8839 update_epilogue_loop_vinfo (epilogue, advance);
8499 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); 8840
8500 unsigned int next_size = 0; 8841 epilogue->simduid = loop->simduid;
8501
8502 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8503 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8504 && known_eq (vf, lowest_vf))
8505 {
8506 unsigned int eiters
8507 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8508 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8509 eiters = eiters % lowest_vf;
8510 epilogue->nb_iterations_upper_bound = eiters - 1;
8511
8512 unsigned int ratio;
8513 while (next_size < vector_sizes.length ()
8514 && !(constant_multiple_p (current_vector_size,
8515 vector_sizes[next_size], &ratio)
8516 && eiters >= lowest_vf / ratio))
8517 next_size += 1;
8518 }
8519 else
8520 while (next_size < vector_sizes.length ()
8521 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8522 next_size += 1;
8523
8524 if (next_size == vector_sizes.length ())
8525 epilogue = NULL;
8526 }
8527
8528 if (epilogue)
8529 {
8530 epilogue->force_vectorize = loop->force_vectorize; 8842 epilogue->force_vectorize = loop->force_vectorize;
8531 epilogue->safelen = loop->safelen;
8532 epilogue->dont_vectorize = false; 8843 epilogue->dont_vectorize = false;
8533
8534 /* We may need to if-convert epilogue to vectorize it. */
8535 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8536 tree_if_conversion (epilogue);
8537 } 8844 }
8538 8845
8539 return epilogue; 8846 return epilogue;
8540 } 8847 }
8541 8848
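For reference, the epilogue-iteration arithmetic in the removed size-selection hunk above can be checked with a small self-contained C example. The concrete numbers below (103 iterations, 3 peeled for alignment, a constant vectorization factor of 8) are hypothetical and only illustrate how eiters and the epilogue's nb_iterations_upper_bound were derived; this is a sketch, not code from the vectorizer.

#include <stdio.h>

int
main (void)
{
  /* Hypothetical values, not taken from any particular loop.  */
  unsigned int niters = 103;    /* LOOP_VINFO_INT_NITERS */
  unsigned int peel = 3;        /* LOOP_VINFO_PEELING_FOR_ALIGNMENT */
  unsigned int lowest_vf = 8;   /* smallest constant vectorization factor */

  /* Scalar iterations left over for the epilogue loop.  */
  unsigned int eiters = (niters - peel) % lowest_vf;   /* 100 % 8 == 4 */
  /* Latch-based bound recorded on the epilogue loop.  */
  unsigned int upper_bound = eiters - 1;               /* 3 */

  printf ("eiters = %u, nb_iterations_upper_bound = %u\n",
          eiters, upper_bound);
  return 0;
}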
8561 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184); 8868 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8562 } 8869 }
8563 */ 8870 */
8564 8871
8565 void 8872 void
8566 optimize_mask_stores (struct loop *loop) 8873 optimize_mask_stores (class loop *loop)
8567 { 8874 {
8568 basic_block *bbs = get_loop_body (loop); 8875 basic_block *bbs = get_loop_body (loop);
8569 unsigned nbbs = loop->num_nodes; 8876 unsigned nbbs = loop->num_nodes;
8570 unsigned i; 8877 unsigned i;
8571 basic_block bb; 8878 basic_block bb;
8572 struct loop *bb_loop; 8879 class loop *bb_loop;
8573 gimple_stmt_iterator gsi; 8880 gimple_stmt_iterator gsi;
8574 gimple *stmt; 8881 gimple *stmt;
8575 auto_vec<gimple *> worklist; 8882 auto_vec<gimple *> worklist;
8883 auto_purge_vect_location sentinel;
8576 8884
8577 vect_location = find_loop_location (loop); 8885 vect_location = find_loop_location (loop);
8578 /* Pick up all masked stores in the loop, if any. */ 8886 /* Pick up all masked stores in the loop, if any. */
8579 for (i = 0; i < nbbs; i++) 8887 for (i = 0; i < nbbs; i++)
8580 { 8888 {
8745 last = worklist.pop (); 9053 last = worklist.pop ();
8746 } 9054 }
8747 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION); 9055 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8748 } 9056 }
8749 } 9057 }
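A minimal scalar sketch of the effect optimize_mask_stores has on the vector code: the MASK_STORE in the dump above ends up inside a block that only executes when the mask compares unequal to an all-zero vector, so iterations whose mask is entirely false skip the store and the address computations feeding it. The helper name any_lane_set and the lane count VLEN below are hypothetical; this is an illustration, not the function's actual GIMPLE output.

#include <stdbool.h>
#include <stddef.h>

#define VLEN 4   /* hypothetical number of vector lanes */

/* Hypothetical helper: true if any lane of MASK is set (the scalar
   analogue of the vector test "mask != { 0, 0, ... }").  */
static bool
any_lane_set (const bool mask[VLEN])
{
  for (size_t i = 0; i < VLEN; i++)
    if (mask[i])
      return true;
  return false;
}

/* Scalar model of a masked store guarded the way optimize_mask_stores
   guards the vector MASK_STORE.  */
static void
guarded_masked_store (float *dst, const bool mask[VLEN], const float val[VLEN])
{
  if (any_lane_set (mask))
    for (size_t i = 0; i < VLEN; i++)
      if (mask[i])
        dst[i] = val[i];
}

int
main (void)
{
  float dst[VLEN] = { 0 };
  float val[VLEN] = { 1, 2, 3, 4 };
  bool all_false[VLEN] = { false, false, false, false };
  bool partial[VLEN] = { false, true, false, true };

  guarded_masked_store (dst, all_false, val);  /* store skipped entirely */
  guarded_masked_store (dst, partial, val);    /* stores lanes 1 and 3 */
  return (int) dst[1];                         /* returns 2 */
}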
9058
9059 /* Decide whether it is possible to use a zero-based induction variable
9060 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9061 return the value that the induction variable must be able to hold
9062 in order to ensure that the loop ends with an all-false mask.
9063 Return -1 otherwise. */
9064 widest_int
9065 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9066 {
9067 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9068 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9069 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9070
9071 /* Calculate the value that the induction variable must be able
9072 to hit in order to ensure that we end the loop with an all-false mask.
9073 This involves adding the maximum number of inactive trailing scalar
9074 iterations. */
9075 widest_int iv_limit = -1;
9076 if (max_loop_iterations (loop, &iv_limit))
9077 {
9078 if (niters_skip)
9079 {
9080 /* Add the maximum number of skipped iterations to the
9081 maximum iteration count. */
9082 if (TREE_CODE (niters_skip) == INTEGER_CST)
9083 iv_limit += wi::to_widest (niters_skip);
9084 else
9085 iv_limit += max_vf - 1;
9086 }
9087 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9088 /* Make a conservatively-correct assumption. */
9089 iv_limit += max_vf - 1;
9090
9091 /* IV_LIMIT is the maximum number of latch iterations, which is also
9092 the maximum in-range IV value. Round this value down to the previous
9093 vector alignment boundary and then add an extra full iteration. */
9094 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9095 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9096 }
9097 return iv_limit;
9098 }
9099
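To make the rounding step in vect_iv_limit_for_full_masking concrete, here is a small self-contained example of iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf. The numbers are hypothetical: 1001 maximum latch iterations, no mask-skip or alignment-peeling term, and a power-of-two constant VF of 8, so known_alignment (vf) and max_vf are both 8.

#include <stdio.h>

int
main (void)
{
  /* Hypothetical values; no niters_skip or peeling-for-alignment term.  */
  long long iv_limit = 1001;   /* maximum number of latch iterations */
  long long vf_align = 8;      /* known_alignment (vf) for a constant VF of 8 */
  long long max_vf = 8;        /* vect_max_vf for this loop */

  /* Round down to the previous vector alignment boundary, then add one
     full vector iteration, as in the function above.  */
  iv_limit = (iv_limit & -vf_align) + max_vf;

  printf ("IV must be able to hold %lld\n", iv_limit);   /* prints 1008 */
  return 0;
}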