Mercurial > hg > CbC > CbC_gcc
comparison gcc/tree-vect-loop-manip.c @ 145:1830386684a0
gcc-9.2.0
author | anatofuz |
---|---|
date | Thu, 13 Feb 2020 11:34:05 +0900 |
parents | 84e7813d76e9 |
children |
comparison
equal
deleted
inserted
replaced
131:84e7813d76e9 | 145:1830386684a0 |
---|---|
1 /* Vectorizer Specific Loop Manipulations | 1 /* Vectorizer Specific Loop Manipulations |
2 Copyright (C) 2003-2018 Free Software Foundation, Inc. | 2 Copyright (C) 2003-2020 Free Software Foundation, Inc. |
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> | 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> |
4 and Ira Rosen <irar@il.ibm.com> | 4 and Ira Rosen <irar@il.ibm.com> |
5 | 5 |
6 This file is part of GCC. | 6 This file is part of GCC. |
7 | 7 |
45 #include "tree-ssa-loop-niter.h" | 45 #include "tree-ssa-loop-niter.h" |
46 #include "internal-fn.h" | 46 #include "internal-fn.h" |
47 #include "stor-layout.h" | 47 #include "stor-layout.h" |
48 #include "optabs-query.h" | 48 #include "optabs-query.h" |
49 #include "vec-perm-indices.h" | 49 #include "vec-perm-indices.h" |
50 #include "insn-config.h" | |
51 #include "rtl.h" | |
52 #include "recog.h" | |
50 | 53 |
51 /************************************************************************* | 54 /************************************************************************* |
52 Simple Loop Peeling Utilities | 55 Simple Loop Peeling Utilities |
53 | 56 |
54 Utilities to support loop peeling for vectorization purposes. | 57 Utilities to support loop peeling for vectorization purposes. |
87 gimple *stmt; | 90 gimple *stmt; |
88 use_operand_p use_p; | 91 use_operand_p use_p; |
89 ssa_op_iter iter; | 92 ssa_op_iter iter; |
90 edge e; | 93 edge e; |
91 edge_iterator ei; | 94 edge_iterator ei; |
92 struct loop *loop = bb->loop_father; | 95 class loop *loop = bb->loop_father; |
93 struct loop *outer_loop = NULL; | 96 class loop *outer_loop = NULL; |
94 | 97 |
95 if (rename_from_outer_loop) | 98 if (rename_from_outer_loop) |
96 { | 99 { |
97 gcc_assert (loop); | 100 gcc_assert (loop); |
98 outer_loop = loop_outer (loop); | 101 outer_loop = loop_outer (loop); |
256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that | 259 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that |
257 the mask should have during the first iteration and NEXT_MASK is the | 260 the mask should have during the first iteration and NEXT_MASK is the |
258 value that it should have on subsequent iterations. */ | 261 value that it should have on subsequent iterations. */ |
259 | 262 |
260 static void | 263 static void |
261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask, | 264 vect_set_loop_mask (class loop *loop, tree mask, tree init_mask, |
262 tree next_mask) | 265 tree next_mask) |
263 { | 266 { |
264 gphi *phi = create_phi_node (mask, loop->header); | 267 gphi *phi = create_phi_node (mask, loop->header); |
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION); | 268 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION); |
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION); | 269 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION); |
267 } | 270 } |
268 | 271 |
269 /* Add SEQ to the end of LOOP's preheader block. */ | 272 /* Add SEQ to the end of LOOP's preheader block. */ |
270 | 273 |
271 static void | 274 static void |
272 add_preheader_seq (struct loop *loop, gimple_seq seq) | 275 add_preheader_seq (class loop *loop, gimple_seq seq) |
273 { | 276 { |
274 if (seq) | 277 if (seq) |
275 { | 278 { |
276 edge pe = loop_preheader_edge (loop); | 279 edge pe = loop_preheader_edge (loop); |
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); | 280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); |
280 } | 283 } |
281 | 284 |
282 /* Add SEQ to the beginning of LOOP's header block. */ | 285 /* Add SEQ to the beginning of LOOP's header block. */ |
283 | 286 |
284 static void | 287 static void |
285 add_header_seq (struct loop *loop, gimple_seq seq) | 288 add_header_seq (class loop *loop, gimple_seq seq) |
286 { | 289 { |
287 if (seq) | 290 if (seq) |
288 { | 291 { |
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header); | 292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header); |
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); | 293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); |
321 rgroup_masks *src_rgm) | 324 rgroup_masks *src_rgm) |
322 { | 325 { |
323 tree src_masktype = src_rgm->mask_type; | 326 tree src_masktype = src_rgm->mask_type; |
324 tree dest_masktype = dest_rgm->mask_type; | 327 tree dest_masktype = dest_rgm->mask_type; |
325 machine_mode src_mode = TYPE_MODE (src_masktype); | 328 machine_mode src_mode = TYPE_MODE (src_masktype); |
329 insn_code icode1, icode2; | |
326 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter | 330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter |
327 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing | 331 && (icode1 = optab_handler (vec_unpacku_hi_optab, |
328 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing) | 332 src_mode)) != CODE_FOR_nothing |
333 && (icode2 = optab_handler (vec_unpacku_lo_optab, | |
334 src_mode)) != CODE_FOR_nothing) | |
329 { | 335 { |
330 /* Unpacking the source masks gives at least as many mask bits as | 336 /* Unpacking the source masks gives at least as many mask bits as |
331 we need. We can then VIEW_CONVERT any excess bits away. */ | 337 we need. We can then VIEW_CONVERT any excess bits away. */ |
332 tree unpack_masktype = vect_halve_mask_nunits (src_masktype); | 338 machine_mode dest_mode = insn_data[icode1].operand[0].mode; |
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode); | |
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode); | |
333 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i) | 341 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i) |
334 { | 342 { |
335 tree src = src_rgm->masks[i / 2]; | 343 tree src = src_rgm->masks[i / 2]; |
336 tree dest = dest_rgm->masks[i]; | 344 tree dest = dest_rgm->masks[i]; |
337 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1) | 345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1) |
380 all the masks in RGM and return a mask that is nonzero when the loop | 388 all the masks in RGM and return a mask that is nonzero when the loop |
381 needs to iterate. Add any new preheader statements to PREHEADER_SEQ. | 389 needs to iterate. Add any new preheader statements to PREHEADER_SEQ. |
382 Use LOOP_COND_GSI to insert code before the exit gcond. | 390 Use LOOP_COND_GSI to insert code before the exit gcond. |
383 | 391 |
384 RGM belongs to loop LOOP. The loop originally iterated NITERS | 392 RGM belongs to loop LOOP. The loop originally iterated NITERS |
385 times and has been vectorized according to LOOP_VINFO. Each iteration | 393 times and has been vectorized according to LOOP_VINFO. |
386 of the vectorized loop handles VF iterations of the scalar loop. | |
387 | 394 |
388 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop | 395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop |
389 starts with NITERS_SKIP dummy iterations of the scalar loop before | 396 starts with NITERS_SKIP dummy iterations of the scalar loop before |
390 the real work starts. The mask elements for these dummy iterations | 397 the real work starts. The mask elements for these dummy iterations |
391 must be 0, to ensure that the extra iterations do not have an effect. | 398 must be 0, to ensure that the extra iterations do not have an effect. |
405 | 412 |
406 This means that we cannot guarantee that such an induction variable | 413 This means that we cannot guarantee that such an induction variable |
407 would ever hit a value that produces a set of all-false masks for RGM. */ | 414 would ever hit a value that produces a set of all-false masks for RGM. */ |
408 | 415 |
409 static tree | 416 static tree |
410 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, | 417 vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo, |
411 gimple_seq *preheader_seq, | 418 gimple_seq *preheader_seq, |
412 gimple_stmt_iterator loop_cond_gsi, | 419 gimple_stmt_iterator loop_cond_gsi, |
413 rgroup_masks *rgm, tree vf, | 420 rgroup_masks *rgm, tree niters, tree niters_skip, |
414 tree niters, tree niters_skip, | |
415 bool might_wrap_p) | 421 bool might_wrap_p) |
416 { | 422 { |
417 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); | 423 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); |
424 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo); | |
418 tree mask_type = rgm->mask_type; | 425 tree mask_type = rgm->mask_type; |
419 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter; | 426 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter; |
420 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type); | 427 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type); |
428 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
421 | 429 |
422 /* Calculate the maximum number of scalar values that the rgroup | 430 /* Calculate the maximum number of scalar values that the rgroup |
423 handles in total, the number that it handles for each iteration | 431 handles in total, the number that it handles for each iteration |
424 of the vector loop, and the number that it should skip during the | 432 of the vector loop, and the number that it should skip during the |
425 first iteration of the vector loop. */ | 433 first iteration of the vector loop. */ |
426 tree nscalars_total = niters; | 434 tree nscalars_total = niters; |
427 tree nscalars_step = vf; | 435 tree nscalars_step = build_int_cst (iv_type, vf); |
428 tree nscalars_skip = niters_skip; | 436 tree nscalars_skip = niters_skip; |
429 if (nscalars_per_iter != 1) | 437 if (nscalars_per_iter != 1) |
430 { | 438 { |
431 /* We checked before choosing to use a fully-masked loop that these | 439 /* We checked before choosing to use a fully-masked loop that these |
432 multiplications don't overflow. */ | 440 multiplications don't overflow. */ |
433 tree factor = build_int_cst (compare_type, nscalars_per_iter); | 441 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter); |
442 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter); | |
434 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type, | 443 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type, |
435 nscalars_total, factor); | 444 nscalars_total, compare_factor); |
436 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type, | 445 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type, |
437 nscalars_step, factor); | 446 nscalars_step, iv_factor); |
438 if (nscalars_skip) | 447 if (nscalars_skip) |
439 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type, | 448 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type, |
440 nscalars_skip, factor); | 449 nscalars_skip, compare_factor); |
441 } | 450 } |
442 | 451 |
443 /* Create an induction variable that counts the number of scalars | 452 /* Create an induction variable that counts the number of scalars |
444 processed. */ | 453 processed. */ |
445 tree index_before_incr, index_after_incr; | 454 tree index_before_incr, index_after_incr; |
446 gimple_stmt_iterator incr_gsi; | 455 gimple_stmt_iterator incr_gsi; |
447 bool insert_after; | 456 bool insert_after; |
457 standard_iv_increment_position (loop, &incr_gsi, &insert_after); | |
458 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop, | |
459 &incr_gsi, insert_after, &index_before_incr, &index_after_incr); | |
460 | |
448 tree zero_index = build_int_cst (compare_type, 0); | 461 tree zero_index = build_int_cst (compare_type, 0); |
449 standard_iv_increment_position (loop, &incr_gsi, &insert_after); | |
450 create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi, | |
451 insert_after, &index_before_incr, &index_after_incr); | |
452 | |
453 tree test_index, test_limit, first_limit; | 462 tree test_index, test_limit, first_limit; |
454 gimple_stmt_iterator *test_gsi; | 463 gimple_stmt_iterator *test_gsi; |
455 if (might_wrap_p) | 464 if (might_wrap_p) |
456 { | 465 { |
457 /* In principle the loop should stop iterating once the incremented | 466 /* In principle the loop should stop iterating once the incremented |
479 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP) | 488 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP) |
480 | 489 |
481 where the rightmost subtraction can be done directly in | 490 where the rightmost subtraction can be done directly in |
482 COMPARE_TYPE. */ | 491 COMPARE_TYPE. */ |
483 test_index = index_before_incr; | 492 test_index = index_before_incr; |
484 tree adjust = nscalars_step; | 493 tree adjust = gimple_convert (preheader_seq, compare_type, |
494 nscalars_step); | |
485 if (nscalars_skip) | 495 if (nscalars_skip) |
486 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type, | 496 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type, |
487 adjust, nscalars_skip); | 497 adjust, nscalars_skip); |
488 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type, | 498 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type, |
489 nscalars_total, adjust); | 499 nscalars_total, adjust); |
522 test_limit, nscalars_skip); | 532 test_limit, nscalars_skip); |
523 test_gsi = &loop_cond_gsi; | 533 test_gsi = &loop_cond_gsi; |
524 | 534 |
525 first_limit = test_limit; | 535 first_limit = test_limit; |
526 } | 536 } |
537 | |
538 /* Convert the IV value to the comparison type (either a no-op or | |
539 a demotion). */ | |
540 gimple_seq test_seq = NULL; | |
541 test_index = gimple_convert (&test_seq, compare_type, test_index); | |
542 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT); | |
527 | 543 |
528 /* Provide a definition of each mask in the group. */ | 544 /* Provide a definition of each mask in the group. */ |
529 tree next_mask = NULL_TREE; | 545 tree next_mask = NULL_TREE; |
530 tree mask; | 546 tree mask; |
531 unsigned int i; | 547 unsigned int i; |
625 | 641 |
626 Insert the branch-back condition before LOOP_COND_GSI and return the | 642 Insert the branch-back condition before LOOP_COND_GSI and return the |
627 final gcond. */ | 643 final gcond. */ |
628 | 644 |
629 static gcond * | 645 static gcond * |
630 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo, | 646 vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo, |
631 tree niters, tree final_iv, | 647 tree niters, tree final_iv, |
632 bool niters_maybe_zero, | 648 bool niters_maybe_zero, |
633 gimple_stmt_iterator loop_cond_gsi) | 649 gimple_stmt_iterator loop_cond_gsi) |
634 { | 650 { |
635 gimple_seq preheader_seq = NULL; | 651 gimple_seq preheader_seq = NULL; |
636 gimple_seq header_seq = NULL; | 652 gimple_seq header_seq = NULL; |
637 | 653 |
638 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); | 654 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); |
639 unsigned int compare_precision = TYPE_PRECISION (compare_type); | 655 unsigned int compare_precision = TYPE_PRECISION (compare_type); |
640 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo); | |
641 tree orig_niters = niters; | 656 tree orig_niters = niters; |
642 | 657 |
643 /* Type of the initial value of NITERS. */ | 658 /* Type of the initial value of NITERS. */ |
644 tree ni_actual_type = TREE_TYPE (niters); | 659 tree ni_actual_type = TREE_TYPE (niters); |
645 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type); | 660 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type); |
661 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); | |
646 | 662 |
647 /* Convert NITERS to the same size as the compare. */ | 663 /* Convert NITERS to the same size as the compare. */ |
648 if (compare_precision > ni_actual_precision | 664 if (compare_precision > ni_actual_precision |
649 && niters_maybe_zero) | 665 && niters_maybe_zero) |
650 { | 666 { |
659 niters, build_one_cst (compare_type)); | 675 niters, build_one_cst (compare_type)); |
660 } | 676 } |
661 else | 677 else |
662 niters = gimple_convert (&preheader_seq, compare_type, niters); | 678 niters = gimple_convert (&preheader_seq, compare_type, niters); |
663 | 679 |
664 /* Convert skip_niters to the right type. */ | 680 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo); |
665 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); | |
666 | |
667 /* Now calculate the value that the induction variable must be able | |
668 to hit in order to ensure that we end the loop with an all-false mask. | |
669 This involves adding the maximum number of inactive trailing scalar | |
670 iterations. */ | |
671 widest_int iv_limit; | |
672 bool known_max_iters = max_loop_iterations (loop, &iv_limit); | |
673 if (known_max_iters) | |
674 { | |
675 if (niters_skip) | |
676 { | |
677 /* Add the maximum number of skipped iterations to the | |
678 maximum iteration count. */ | |
679 if (TREE_CODE (niters_skip) == INTEGER_CST) | |
680 iv_limit += wi::to_widest (niters_skip); | |
681 else | |
682 iv_limit += max_vf - 1; | |
683 } | |
684 /* IV_LIMIT is the maximum number of latch iterations, which is also | |
685 the maximum in-range IV value. Round this value down to the previous | |
686 vector alignment boundary and then add an extra full iteration. */ | |
687 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
688 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf; | |
689 } | |
690 | |
691 /* Get the vectorization factor in tree form. */ | |
692 tree vf = build_int_cst (compare_type, | |
693 LOOP_VINFO_VECT_FACTOR (loop_vinfo)); | |
694 | 681 |
695 /* Iterate over all the rgroups and fill in their masks. We could use | 682 /* Iterate over all the rgroups and fill in their masks. We could use |
696 the first mask from any rgroup for the loop condition; here we | 683 the first mask from any rgroup for the loop condition; here we |
697 arbitrarily pick the last. */ | 684 arbitrarily pick the last. */ |
698 tree test_mask = NULL_TREE; | 685 tree test_mask = NULL_TREE; |
715 } | 702 } |
716 | 703 |
717 /* See whether zero-based IV would ever generate all-false masks | 704 /* See whether zero-based IV would ever generate all-false masks |
718 before wrapping around. */ | 705 before wrapping around. */ |
719 bool might_wrap_p | 706 bool might_wrap_p |
720 = (!known_max_iters | 707 = (iv_limit == -1 |
721 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter, | 708 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter, |
722 UNSIGNED) | 709 UNSIGNED) |
723 > compare_precision)); | 710 > compare_precision)); |
724 | 711 |
725 /* Set up all masks for this group. */ | 712 /* Set up all masks for this group. */ |
726 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo, | 713 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo, |
727 &preheader_seq, | 714 &preheader_seq, |
728 loop_cond_gsi, rgm, vf, | 715 loop_cond_gsi, rgm, |
729 niters, niters_skip, | 716 niters, niters_skip, |
730 might_wrap_p); | 717 might_wrap_p); |
731 } | 718 } |
732 | 719 |
733 /* Emit all accumulated statements. */ | 720 /* Emit all accumulated statements. */ |
762 | 749 |
763 /* Like vect_set_loop_condition, but handle the case in which there | 750 /* Like vect_set_loop_condition, but handle the case in which there |
764 are no loop masks. */ | 751 are no loop masks. */ |
765 | 752 |
766 static gcond * | 753 static gcond * |
767 vect_set_loop_condition_unmasked (struct loop *loop, tree niters, | 754 vect_set_loop_condition_unmasked (class loop *loop, tree niters, |
768 tree step, tree final_iv, | 755 tree step, tree final_iv, |
769 bool niters_maybe_zero, | 756 bool niters_maybe_zero, |
770 gimple_stmt_iterator loop_cond_gsi) | 757 gimple_stmt_iterator loop_cond_gsi) |
771 { | 758 { |
772 tree indx_before_incr, indx_after_incr; | 759 tree indx_before_incr, indx_after_incr; |
915 N * STEP on exit from the loop. | 902 N * STEP on exit from the loop. |
916 | 903 |
917 Assumption: the exit-condition of LOOP is the last stmt in the loop. */ | 904 Assumption: the exit-condition of LOOP is the last stmt in the loop. */ |
918 | 905 |
919 void | 906 void |
920 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo, | 907 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo, |
921 tree niters, tree step, tree final_iv, | 908 tree niters, tree step, tree final_iv, |
922 bool niters_maybe_zero) | 909 bool niters_maybe_zero) |
923 { | 910 { |
924 gcond *cond_stmt; | 911 gcond *cond_stmt; |
925 gcond *orig_cond = get_loop_exit_condition (loop); | 912 gcond *orig_cond = get_loop_exit_condition (loop); |
975 gsi_next (&gsi_to); | 962 gsi_next (&gsi_to); |
976 continue; | 963 continue; |
977 } | 964 } |
978 if (TREE_CODE (from_arg) != SSA_NAME) | 965 if (TREE_CODE (from_arg) != SSA_NAME) |
979 gcc_assert (operand_equal_p (from_arg, to_arg, 0)); | 966 gcc_assert (operand_equal_p (from_arg, to_arg, 0)); |
980 else | 967 else if (TREE_CODE (to_arg) == SSA_NAME |
968 && from_arg != to_arg) | |
981 { | 969 { |
982 if (get_current_def (to_arg) == NULL_TREE) | 970 if (get_current_def (to_arg) == NULL_TREE) |
983 set_current_def (to_arg, get_current_def (from_arg)); | 971 { |
972 gcc_assert (types_compatible_p (TREE_TYPE (to_arg), | |
973 TREE_TYPE (get_current_def | |
974 (from_arg)))); | |
975 set_current_def (to_arg, get_current_def (from_arg)); | |
976 } | |
984 } | 977 } |
985 gsi_next (&gsi_from); | 978 gsi_next (&gsi_from); |
986 gsi_next (&gsi_to); | 979 gsi_next (&gsi_to); |
987 } | 980 } |
988 | 981 |
998 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is | 991 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is |
999 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the | 992 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the |
1000 basic blocks from SCALAR_LOOP instead of LOOP, but to either the | 993 basic blocks from SCALAR_LOOP instead of LOOP, but to either the |
1001 entry or exit of LOOP. */ | 994 entry or exit of LOOP. */ |
1002 | 995 |
1003 struct loop * | 996 class loop * |
1004 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, | 997 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, |
1005 struct loop *scalar_loop, edge e) | 998 class loop *scalar_loop, edge e) |
1006 { | 999 { |
1007 struct loop *new_loop; | 1000 class loop *new_loop; |
1008 basic_block *new_bbs, *bbs, *pbbs; | 1001 basic_block *new_bbs, *bbs, *pbbs; |
1009 bool at_exit; | 1002 bool at_exit; |
1010 bool was_imm_dom; | 1003 bool was_imm_dom; |
1011 basic_block exit_dest; | 1004 basic_block exit_dest; |
1012 edge exit, new_exit; | 1005 edge exit, new_exit; |
1227 (3) its exit condition is the last stmt in the header | 1220 (3) its exit condition is the last stmt in the header |
1228 (4) E is the entry/exit edge of LOOP. | 1221 (4) E is the entry/exit edge of LOOP. |
1229 */ | 1222 */ |
1230 | 1223 |
1231 bool | 1224 bool |
1232 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e) | 1225 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e) |
1233 { | 1226 { |
1234 edge exit_e = single_exit (loop); | 1227 edge exit_e = single_exit (loop); |
1235 edge entry_e = loop_preheader_edge (loop); | 1228 edge entry_e = loop_preheader_edge (loop); |
1236 gcond *orig_cond = get_loop_exit_condition (loop); | 1229 gcond *orig_cond = get_loop_exit_condition (loop); |
1237 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src); | 1230 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src); |
1254 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI | 1247 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI |
1255 in the exit bb and rename all the uses after the loop. This simplifies | 1248 in the exit bb and rename all the uses after the loop. This simplifies |
1256 the *guard[12] routines, which assume loop closed SSA form for all PHIs | 1249 the *guard[12] routines, which assume loop closed SSA form for all PHIs |
1257 (but normally loop closed SSA form doesn't require virtual PHIs to be | 1250 (but normally loop closed SSA form doesn't require virtual PHIs to be |
1258 in the same form). Doing this early simplifies the checking what | 1251 in the same form). Doing this early simplifies the checking what |
1259 uses should be renamed. */ | 1252 uses should be renamed. |
1260 | 1253 |
1261 static void | 1254 If we create a new phi after the loop, return the definition that |
1262 create_lcssa_for_virtual_phi (struct loop *loop) | 1255 applies on entry to the loop, otherwise return null. */ |
1256 | |
1257 static tree | |
1258 create_lcssa_for_virtual_phi (class loop *loop) | |
1263 { | 1259 { |
1264 gphi_iterator gsi; | 1260 gphi_iterator gsi; |
1265 edge exit_e = single_exit (loop); | 1261 edge exit_e = single_exit (loop); |
1266 | 1262 |
1267 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi)) | 1263 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi)) |
1288 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop) | 1284 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop) |
1289 if (stmt != new_phi | 1285 if (stmt != new_phi |
1290 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt))) | 1286 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt))) |
1291 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) | 1287 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) |
1292 SET_USE (use_p, new_vop); | 1288 SET_USE (use_p, new_vop); |
1289 | |
1290 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); | |
1293 } | 1291 } |
1294 break; | 1292 break; |
1295 } | 1293 } |
1296 | 1294 return NULL_TREE; |
1297 } | 1295 } |
1298 | 1296 |
1299 /* Function vect_get_loop_location. | 1297 /* Function vect_get_loop_location. |
1300 | 1298 |
1301 Extract the location of the loop in the source code. | 1299 Extract the location of the loop in the source code. |
1302 If the loop is not well formed for vectorization, an estimated | 1300 If the loop is not well formed for vectorization, an estimated |
1303 location is calculated. | 1301 location is calculated. |
1304 Return the loop location if succeed and NULL if not. */ | 1302 Return the loop location if succeed and NULL if not. */ |
1305 | 1303 |
1306 dump_user_location_t | 1304 dump_user_location_t |
1307 find_loop_location (struct loop *loop) | 1305 find_loop_location (class loop *loop) |
1308 { | 1306 { |
1309 gimple *stmt = NULL; | 1307 gimple *stmt = NULL; |
1310 basic_block bb; | 1308 basic_block bb; |
1311 gimple_stmt_iterator si; | 1309 gimple_stmt_iterator si; |
1312 | 1310 |
1364 These restrictions will be relaxed in the future. */ | 1362 These restrictions will be relaxed in the future. */ |
1365 | 1363 |
1366 bool | 1364 bool |
1367 vect_can_advance_ivs_p (loop_vec_info loop_vinfo) | 1365 vect_can_advance_ivs_p (loop_vec_info loop_vinfo) |
1368 { | 1366 { |
1369 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 1367 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1370 basic_block bb = loop->header; | 1368 basic_block bb = loop->header; |
1371 gphi_iterator gsi; | 1369 gphi_iterator gsi; |
1372 | 1370 |
1373 /* Analyze phi functions of the loop header. */ | 1371 /* Analyze phi functions of the loop header. */ |
1374 | 1372 |
1478 static void | 1476 static void |
1479 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, | 1477 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, |
1480 tree niters, edge update_e) | 1478 tree niters, edge update_e) |
1481 { | 1479 { |
1482 gphi_iterator gsi, gsi1; | 1480 gphi_iterator gsi, gsi1; |
1483 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 1481 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1484 basic_block update_bb = update_e->dest; | 1482 basic_block update_bb = update_e->dest; |
1485 basic_block exit_bb = single_exit (loop)->dest; | 1483 basic_block exit_bb = single_exit (loop)->dest; |
1486 | 1484 |
1487 /* Make sure there exists a single-predecessor exit bb: */ | 1485 /* Make sure there exists a single-predecessor exit bb: */ |
1488 gcc_assert (single_pred_p (exit_bb)); | 1486 gcc_assert (single_pred_p (exit_bb)); |
1559 { | 1557 { |
1560 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); | 1558 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); |
1561 stmt_vec_info stmt_info = dr_info->stmt; | 1559 stmt_vec_info stmt_info = dr_info->stmt; |
1562 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | 1560 tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
1563 | 1561 |
1564 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info); | 1562 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info); |
1565 gcc_assert (target_align != 0); | 1563 unsigned HOST_WIDE_INT target_align_c; |
1564 tree target_align_minus_1; | |
1566 | 1565 |
1567 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr), | 1566 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr), |
1568 size_zero_node) < 0; | 1567 size_zero_node) < 0; |
1569 tree offset = (negative | 1568 tree offset = (negative |
1570 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) | 1569 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) |
1571 : size_zero_node); | 1570 : size_zero_node); |
1572 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq, | 1571 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq, |
1573 offset); | 1572 offset); |
1574 tree type = unsigned_type_for (TREE_TYPE (start_addr)); | 1573 tree type = unsigned_type_for (TREE_TYPE (start_addr)); |
1575 tree target_align_minus_1 = build_int_cst (type, target_align - 1); | 1574 if (target_align.is_constant (&target_align_c)) |
1575 target_align_minus_1 = build_int_cst (type, target_align_c - 1); | |
1576 else | |
1577 { | |
1578 tree vla = build_int_cst (type, target_align); | |
1579 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla, | |
1580 fold_build2 (MINUS_EXPR, type, | |
1581 build_int_cst (type, 0), vla)); | |
1582 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align, | |
1583 build_int_cst (type, 1)); | |
1584 } | |
1585 | |
1576 HOST_WIDE_INT elem_size | 1586 HOST_WIDE_INT elem_size |
1577 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); | 1587 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); |
1578 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size)); | 1588 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size)); |
1579 | 1589 |
1580 /* Create: misalign_in_bytes = addr & (target_align - 1). */ | 1590 /* Create: misalign_in_bytes = addr & (target_align - 1). */ |
1629 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); | 1639 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); |
1630 gimple_seq stmts = NULL, new_stmts = NULL; | 1640 gimple_seq stmts = NULL, new_stmts = NULL; |
1631 tree iters, iters_name; | 1641 tree iters, iters_name; |
1632 stmt_vec_info stmt_info = dr_info->stmt; | 1642 stmt_vec_info stmt_info = dr_info->stmt; |
1633 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | 1643 tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
1634 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info); | 1644 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info); |
1635 | 1645 |
1636 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) | 1646 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) |
1637 { | 1647 { |
1638 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); | 1648 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
1639 | 1649 |
1648 { | 1658 { |
1649 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo); | 1659 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo); |
1650 tree type = TREE_TYPE (misalign_in_elems); | 1660 tree type = TREE_TYPE (misalign_in_elems); |
1651 HOST_WIDE_INT elem_size | 1661 HOST_WIDE_INT elem_size |
1652 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); | 1662 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); |
1653 HOST_WIDE_INT align_in_elems = target_align / elem_size; | 1663 /* We only do prolog peeling if the target alignment is known at compile |
1654 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1); | 1664 time. */ |
1665 poly_uint64 align_in_elems = | |
1666 exact_div (target_align, elem_size); | |
1667 tree align_in_elems_minus_1 = | |
1668 build_int_cst (type, align_in_elems - 1); | |
1655 tree align_in_elems_tree = build_int_cst (type, align_in_elems); | 1669 tree align_in_elems_tree = build_int_cst (type, align_in_elems); |
1656 | 1670 |
1657 /* Create: (niters_type) ((align_in_elems - misalign_in_elems) | 1671 /* Create: (niters_type) ((align_in_elems - misalign_in_elems) |
1658 & (align_in_elems - 1)). */ | 1672 & (align_in_elems - 1)). */ |
1659 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr), | 1673 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr), |
1664 else | 1678 else |
1665 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree, | 1679 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree, |
1666 misalign_in_elems); | 1680 misalign_in_elems); |
1667 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1); | 1681 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1); |
1668 iters = fold_convert (niters_type, iters); | 1682 iters = fold_convert (niters_type, iters); |
1669 *bound = align_in_elems - 1; | 1683 unsigned HOST_WIDE_INT align_in_elems_c; |
1684 if (align_in_elems.is_constant (&align_in_elems_c)) | |
1685 *bound = align_in_elems_c - 1; | |
1686 else | |
1687 *bound = -1; | |
1670 } | 1688 } |
1671 | 1689 |
1672 if (dump_enabled_p ()) | 1690 if (dump_enabled_p ()) |
1673 dump_printf_loc (MSG_NOTE, vect_location, | 1691 dump_printf_loc (MSG_NOTE, vect_location, |
1674 "niters for prolog loop: %T\n", iters); | 1692 "niters for prolog loop: %T\n", iters); |
1696 If CODE is PLUS, the vector loop starts NITERS iterations after the | 1714 If CODE is PLUS, the vector loop starts NITERS iterations after the |
1697 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS | 1715 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS |
1698 iterations before the scalar one (using masking to skip inactive | 1716 iterations before the scalar one (using masking to skip inactive |
1699 elements). This function updates the information recorded in DR to | 1717 elements). This function updates the information recorded in DR to |
1700 account for the difference. Specifically, it updates the OFFSET | 1718 account for the difference. Specifically, it updates the OFFSET |
1701 field of DR. */ | 1719 field of DR_INFO. */ |
1702 | 1720 |
1703 static void | 1721 static void |
1704 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code) | 1722 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code) |
1705 { | 1723 { |
1706 tree offset = DR_OFFSET (dr); | 1724 struct data_reference *dr = dr_info->dr; |
1725 tree offset = dr_info->offset; | |
1726 if (!offset) | |
1727 offset = build_zero_cst (sizetype); | |
1707 | 1728 |
1708 niters = fold_build2 (MULT_EXPR, sizetype, | 1729 niters = fold_build2 (MULT_EXPR, sizetype, |
1709 fold_convert (sizetype, niters), | 1730 fold_convert (sizetype, niters), |
1710 fold_convert (sizetype, DR_STEP (dr))); | 1731 fold_convert (sizetype, DR_STEP (dr))); |
1711 offset = fold_build2 (code, sizetype, | 1732 offset = fold_build2 (code, sizetype, |
1712 fold_convert (sizetype, offset), niters); | 1733 fold_convert (sizetype, offset), niters); |
1713 DR_OFFSET (dr) = offset; | 1734 dr_info->offset = offset; |
1714 } | 1735 } |
1715 | 1736 |
1716 | 1737 |
1717 /* Function vect_update_inits_of_drs | 1738 /* Function vect_update_inits_of_drs |
1718 | 1739 |
1719 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO. | 1740 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO. |
1720 CODE and NITERS are as for vect_update_inits_of_dr. */ | 1741 CODE and NITERS are as for vect_update_inits_of_dr. */ |
1721 | 1742 |
1722 static void | 1743 void |
1723 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, | 1744 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, |
1724 tree_code code) | 1745 tree_code code) |
1725 { | 1746 { |
1726 unsigned int i; | 1747 unsigned int i; |
1727 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); | 1748 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
1728 struct data_reference *dr; | 1749 struct data_reference *dr; |
1729 | 1750 |
1730 DUMP_VECT_SCOPE ("vect_update_inits_of_dr"); | 1751 DUMP_VECT_SCOPE ("vect_update_inits_of_dr"); |
1731 | 1752 |
1732 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */ | 1753 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader |
1754 here, but since we might use these niters to update the epilogues niters | |
1755 and data references we can't insert them here as this definition might not | |
1756 always dominate its uses. */ | |
1733 if (!types_compatible_p (sizetype, TREE_TYPE (niters))) | 1757 if (!types_compatible_p (sizetype, TREE_TYPE (niters))) |
1734 { | 1758 niters = fold_convert (sizetype, niters); |
1735 gimple_seq seq; | |
1736 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); | |
1737 tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters"); | |
1738 | |
1739 niters = fold_convert (sizetype, niters); | |
1740 niters = force_gimple_operand (niters, &seq, false, var); | |
1741 if (seq) | |
1742 { | |
1743 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); | |
1744 gcc_assert (!new_bb); | |
1745 } | |
1746 } | |
1747 | 1759 |
1748 FOR_EACH_VEC_ELT (datarefs, i, dr) | 1760 FOR_EACH_VEC_ELT (datarefs, i, dr) |
1749 { | 1761 { |
1750 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); | 1762 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
1751 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)) | 1763 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)) |
1752 vect_update_init_of_dr (dr, niters, code); | 1764 vect_update_init_of_dr (dr_info, niters, code); |
1753 } | 1765 } |
1754 } | 1766 } |
1755 | 1767 |
1756 /* For the information recorded in LOOP_VINFO prepare the loop for peeling | 1768 /* For the information recorded in LOOP_VINFO prepare the loop for peeling |
1757 by masking. This involves calculating the number of iterations to | 1769 by masking. This involves calculating the number of iterations to |
1984 tree niters_vector, | 1996 tree niters_vector, |
1985 tree *niters_vector_mult_vf_ptr) | 1997 tree *niters_vector_mult_vf_ptr) |
1986 { | 1998 { |
1987 /* We should be using a step_vector of VF if VF is variable. */ | 1999 /* We should be using a step_vector of VF if VF is variable. */ |
1988 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant (); | 2000 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant (); |
1989 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 2001 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1990 tree type = TREE_TYPE (niters_vector); | 2002 tree type = TREE_TYPE (niters_vector); |
1991 tree log_vf = build_int_cst (type, exact_log2 (vf)); | 2003 tree log_vf = build_int_cst (type, exact_log2 (vf)); |
1992 basic_block exit_bb = single_exit (loop)->dest; | 2004 basic_block exit_bb = single_exit (loop)->dest; |
1993 | 2005 |
1994 gcc_assert (niters_vector_mult_vf_ptr != NULL); | 2006 gcc_assert (niters_vector_mult_vf_ptr != NULL); |
2002 &stmts, true, var); | 2014 &stmts, true, var); |
2003 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb); | 2015 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb); |
2004 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT); | 2016 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT); |
2005 } | 2017 } |
2006 *niters_vector_mult_vf_ptr = niters_vector_mult_vf; | 2018 *niters_vector_mult_vf_ptr = niters_vector_mult_vf; |
2019 } | |
2020 | |
2021 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP, | |
2022 this function searches for the corresponding lcssa phi node in exit | |
2023 bb of LOOP. If it is found, return the phi result; otherwise return | |
2024 NULL. */ | |
2025 | |
2026 static tree | |
2027 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED, | |
2028 gphi *lcssa_phi) | |
2029 { | |
2030 gphi_iterator gsi; | |
2031 edge e = single_exit (loop); | |
2032 | |
2033 gcc_assert (single_pred_p (e->dest)); | |
2034 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi)) | |
2035 { | |
2036 gphi *phi = gsi.phi (); | |
2037 if (operand_equal_p (PHI_ARG_DEF (phi, 0), | |
2038 PHI_ARG_DEF (lcssa_phi, 0), 0)) | |
2039 return PHI_RESULT (phi); | |
2040 } | |
2041 return NULL_TREE; | |
2007 } | 2042 } |
2008 | 2043 |
2009 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND | 2044 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND |
2010 from SECOND/FIRST and puts it at the original loop's preheader/exit | 2045 from SECOND/FIRST and puts it at the original loop's preheader/exit |
2011 edge, the two loops are arranged as below: | 2046 edge, the two loops are arranged as below: |
2052 second loop, i.e, between_bb in the example code. With PHIs updated, | 2087 second loop, i.e, between_bb in the example code. With PHIs updated, |
2053 the second loop will execute rest iterations of the first. */ | 2088 the second loop will execute rest iterations of the first. */ |
2054 | 2089 |
2055 static void | 2090 static void |
2056 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo, | 2091 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo, |
2057 struct loop *first, struct loop *second, | 2092 class loop *first, class loop *second, |
2058 bool create_lcssa_for_iv_phis) | 2093 bool create_lcssa_for_iv_phis) |
2059 { | 2094 { |
2060 gphi_iterator gsi_update, gsi_orig; | 2095 gphi_iterator gsi_update, gsi_orig; |
2061 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 2096 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
2062 | 2097 |
2063 edge first_latch_e = EDGE_SUCC (first->latch, 0); | 2098 edge first_latch_e = EDGE_SUCC (first->latch, 0); |
2064 edge second_preheader_e = loop_preheader_edge (second); | 2099 edge second_preheader_e = loop_preheader_edge (second); |
2065 basic_block between_bb = single_exit (first)->dest; | 2100 basic_block between_bb = single_exit (first)->dest; |
2066 | 2101 |
2090 } | 2125 } |
2091 | 2126 |
2092 /* Update PHI node in the second loop by replacing arg on the loop's | 2127 /* Update PHI node in the second loop by replacing arg on the loop's |
2093 incoming edge. */ | 2128 incoming edge. */ |
2094 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg); | 2129 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg); |
2130 } | |
2131 | |
2132 /* For epilogue peeling we have to make sure to copy all LC PHIs | |
2133 for correct vectorization of live stmts. */ | |
2134 if (loop == first) | |
2135 { | |
2136 basic_block orig_exit = single_exit (second)->dest; | |
2137 for (gsi_orig = gsi_start_phis (orig_exit); | |
2138 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig)) | |
2139 { | |
2140 gphi *orig_phi = gsi_orig.phi (); | |
2141 tree orig_arg = PHI_ARG_DEF (orig_phi, 0); | |
2142 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg)) | |
2143 continue; | |
2144 | |
2145 /* Already created in the above loop. */ | |
2146 if (find_guard_arg (first, second, orig_phi)) | |
2147 continue; | |
2148 | |
2149 tree new_res = copy_ssa_name (orig_arg); | |
2150 gphi *lcphi = create_phi_node (new_res, between_bb); | |
2151 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION); | |
2152 } | |
2095 } | 2153 } |
2096 } | 2154 } |
2097 | 2155 |
2098 /* Function slpeel_add_loop_guard adds guard skipping from the beginning | 2156 /* Function slpeel_add_loop_guard adds guard skipping from the beginning |
2099 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE | 2157 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE |
2140 | 2198 |
2141 This function creates PHI nodes at merge_bb and replaces the use of i_5 | 2199 This function creates PHI nodes at merge_bb and replaces the use of i_5 |
2142 in the update_loop's PHI node with the result of new PHI result. */ | 2200 in the update_loop's PHI node with the result of new PHI result. */ |
2143 | 2201 |
2144 static void | 2202 static void |
2145 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop, | 2203 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop, |
2146 struct loop *update_loop, | 2204 class loop *update_loop, |
2147 edge guard_edge, edge merge_edge) | 2205 edge guard_edge, edge merge_edge) |
2148 { | 2206 { |
2149 source_location merge_loc, guard_loc; | 2207 location_t merge_loc, guard_loc; |
2150 edge orig_e = loop_preheader_edge (skip_loop); | 2208 edge orig_e = loop_preheader_edge (skip_loop); |
2151 edge update_e = loop_preheader_edge (update_loop); | 2209 edge update_e = loop_preheader_edge (update_loop); |
2152 gphi_iterator gsi_orig, gsi_update; | 2210 gphi_iterator gsi_orig, gsi_update; |
2153 | 2211 |
2154 for ((gsi_orig = gsi_start_phis (skip_loop->header), | 2212 for ((gsi_orig = gsi_start_phis (skip_loop->header), |
2173 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc); | 2231 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc); |
2174 | 2232 |
2175 /* Update phi in UPDATE_PHI. */ | 2233 /* Update phi in UPDATE_PHI. */ |
2176 adjust_phi_and_debug_stmts (update_phi, update_e, new_res); | 2234 adjust_phi_and_debug_stmts (update_phi, update_e, new_res); |
2177 } | 2235 } |
2178 } | |
2179 | |
2180 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP, | |
2181 this function searches for the corresponding lcssa phi node in exit | |
2182 bb of LOOP. If it is found, return the phi result; otherwise return | |
2183 NULL. */ | |
2184 | |
2185 static tree | |
2186 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED, | |
2187 gphi *lcssa_phi) | |
2188 { | |
2189 gphi_iterator gsi; | |
2190 edge e = single_exit (loop); | |
2191 | |
2192 gcc_assert (single_pred_p (e->dest)); | |
2193 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi)) | |
2194 { | |
2195 gphi *phi = gsi.phi (); | |
2196 if (operand_equal_p (PHI_ARG_DEF (phi, 0), | |
2197 PHI_ARG_DEF (lcssa_phi, 0), 0)) | |
2198 return PHI_RESULT (phi); | |
2199 } | |
2200 return NULL_TREE; | |
2201 } | 2236 } |
2202 | 2237 |
2203 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied | 2238 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied |
2204 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a | 2239 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a |
2205 point between the two loops to the end of EPILOG. Edges GUARD_EDGE | 2240 point between the two loops to the end of EPILOG. Edges GUARD_EDGE |
2252 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined | 2287 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined |
2253 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI | 2288 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI |
2254 in exit_bb will also be updated. */ | 2289 in exit_bb will also be updated. */ |
2255 | 2290 |
2256 static void | 2291 static void |
2257 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog, | 2292 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog, |
2258 edge guard_edge, edge merge_edge) | 2293 edge guard_edge, edge merge_edge) |
2259 { | 2294 { |
2260 gphi_iterator gsi; | 2295 gphi_iterator gsi; |
2261 basic_block merge_bb = guard_edge->dest; | 2296 basic_block merge_bb = guard_edge->dest; |
2262 | 2297 |
2268 | 2303 |
2269 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi)) | 2304 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi)) |
2270 { | 2305 { |
2271 gphi *update_phi = gsi.phi (); | 2306 gphi *update_phi = gsi.phi (); |
2272 tree old_arg = PHI_ARG_DEF (update_phi, 0); | 2307 tree old_arg = PHI_ARG_DEF (update_phi, 0); |
2273 /* This loop-closed-phi actually doesn't represent a use out of the | 2308 |
2274 loop - the phi arg is a constant. */ | 2309 tree merge_arg = NULL_TREE; |
2275 if (TREE_CODE (old_arg) != SSA_NAME) | 2310 |
2276 continue; | 2311 /* If the old argument is a SSA_NAME use its current_def. */ |
2277 | 2312 if (TREE_CODE (old_arg) == SSA_NAME) |
2278 tree merge_arg = get_current_def (old_arg); | 2313 merge_arg = get_current_def (old_arg); |
2314 /* If it's a constant or doesn't have a current_def, just use the old | |
2315 argument. */ | |
2279 if (!merge_arg) | 2316 if (!merge_arg) |
2280 merge_arg = old_arg; | 2317 merge_arg = old_arg; |
2281 | 2318 |
2282 tree guard_arg = find_guard_arg (loop, epilog, update_phi); | 2319 tree guard_arg = find_guard_arg (loop, epilog, update_phi); |
2283 /* If the var is live after loop but not a reduction, we simply | 2320 /* If the var is live after loop but not a reduction, we simply |
2301 | 2338 |
2302 /* EPILOG loop is duplicated from the original loop for vectorizing, | 2339 /* EPILOG loop is duplicated from the original loop for vectorizing, |
2303 the arg of its loop closed ssa PHI needs to be updated. */ | 2340 the arg of its loop closed ssa PHI needs to be updated. */ |
2304 | 2341 |
2305 static void | 2342 static void |
2306 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog) | 2343 slpeel_update_phi_nodes_for_lcssa (class loop *epilog) |
2307 { | 2344 { |
2308 gphi_iterator gsi; | 2345 gphi_iterator gsi; |
2309 basic_block exit_bb = single_exit (epilog)->dest; | 2346 basic_block exit_bb = single_exit (epilog)->dest; |
2310 | 2347 |
2311 gcc_assert (single_pred_p (exit_bb)); | 2348 gcc_assert (single_pred_p (exit_bb)); |
2384 | 2421 |
2385 merge_bb_3: | 2422 merge_bb_3: |
2386 | 2423 |
2387 Note this function peels prolog and epilog only if it's necessary, | 2424 Note this function peels prolog and epilog only if it's necessary, |
2388 as well as guards. | 2425 as well as guards. |
2389 Returns created epilogue or NULL. | 2426 This function returns the epilogue loop if a decision was made to vectorize |
2427 it, otherwise NULL. | |
2428 | |
2429 The analysis resulting in this epilogue loop's loop_vec_info was performed | |
2430 in the same vect_analyze_loop call as the main loop's. At that time | |
2431 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower | |
2432 vectorization factors than the main loop. This list is stored in the main | |
2433 loop's loop_vec_info in the 'epilogue_vinfos' member. Everytime we decide to | |
2434 vectorize the epilogue loop for a lower vectorization factor, the | |
2435 loop_vec_info sitting at the top of the epilogue_vinfos list is removed, | |
2436 updated and linked to the epilogue loop. This is later used to vectorize | |
2437 the epilogue. The reason the loop_vec_info needs updating is that it was | |
2438 constructed based on the original main loop, and the epilogue loop is a | |
2439 copy of this loop, so all links pointing to statements in the original loop | |
2440 need updating. Furthermore, these loop_vec_infos share the | |
2441 data_reference's records, which will also need to be updated. | |
2390 | 2442 |
2391 TODO: Guard for prefer_scalar_loop should be emitted along with | 2443 TODO: Guard for prefer_scalar_loop should be emitted along with |
2392 versioning conditions if loop versioning is needed. */ | 2444 versioning conditions if loop versioning is needed. */ |
2393 | 2445 |
2394 | 2446 |
2395 struct loop * | 2447 class loop * |
2396 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, | 2448 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, |
2397 tree *niters_vector, tree *step_vector, | 2449 tree *niters_vector, tree *step_vector, |
2398 tree *niters_vector_mult_vf_var, int th, | 2450 tree *niters_vector_mult_vf_var, int th, |
2399 bool check_profitability, bool niters_no_overflow) | 2451 bool check_profitability, bool niters_no_overflow, |
2452 tree *advance) | |
2400 { | 2453 { |
2401 edge e, guard_e; | 2454 edge e, guard_e; |
2402 tree type = TREE_TYPE (niters), guard_cond; | 2455 tree type = TREE_TYPE (niters), guard_cond; |
2403 basic_block guard_bb, guard_to; | 2456 basic_block guard_bb, guard_to; |
2404 profile_probability prob_prolog, prob_vector, prob_epilog; | 2457 profile_probability prob_prolog, prob_vector, prob_epilog; |
2405 int estimated_vf; | 2458 int estimated_vf; |
2406 int prolog_peeling = 0; | 2459 int prolog_peeling = 0; |
2460 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0; | |
2461 /* We currently do not support prolog peeling if the target alignment is not | |
2462 known at compile time. 'vect_gen_prolog_loop_niters' depends on the | |
2463 target alignment being constant. */ | |
2464 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); | |
2465 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ()) | |
2466 return NULL; | |
2467 | |
2407 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) | 2468 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
2408 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); | 2469 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
2409 | 2470 |
2410 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | 2471 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
2411 poly_uint64 bound_epilog = 0; | 2472 poly_uint64 bound_epilog = 0; |
2425 if (estimated_vf == 2) | 2486 if (estimated_vf == 2) |
2426 estimated_vf = 3; | 2487 estimated_vf = 3; |
2427 prob_prolog = prob_epilog = profile_probability::guessed_always () | 2488 prob_prolog = prob_epilog = profile_probability::guessed_always () |
2428 .apply_scale (estimated_vf - 1, estimated_vf); | 2489 .apply_scale (estimated_vf - 1, estimated_vf); |
2429 | 2490 |
2430 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo); | 2491 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo); |
2431 struct loop *first_loop = loop; | 2492 class loop *first_loop = loop; |
2432 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP; | 2493 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP; |
2494 | |
2495 /* We might have a queued need to update virtual SSA form. As we | |
2496 delete the update SSA machinery below after doing a regular | |
2497 incremental SSA update during loop copying make sure we don't | |
2498 lose that fact. | |
2499 ??? Needing to update virtual SSA form by renaming is unfortunate | |
2500 but not all of the vectorizer code inserting new loads / stores | |
2501 properly assigns virtual operands to those statements. */ | |
2502 update_ssa (TODO_update_ssa_only_virtuals); | |
2503 | |
2433 create_lcssa_for_virtual_phi (loop); | 2504 create_lcssa_for_virtual_phi (loop); |
2434 update_ssa (TODO_update_ssa_only_virtuals); | 2505 |
2506 /* If we're vectorizing an epilogue loop, the update_ssa above will | |
2507 have ensured that the virtual operand is in SSA form throughout the | |
2508 vectorized main loop. Normally it is possible to trace the updated | |
2509 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses | |
2510 back to scalar-stmt vuses, meaning that the effect of the SSA update | |
2511 remains local to the main loop. However, there are rare cases in | |
2512 which the vectorized loop has vdefs even when the original scalar | |
2513 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES | |
2514 introduces clobbers of the temporary vector array, which in turn | |
2515 needs new vdefs. If the scalar loop doesn't write to memory, these | |
2516 new vdefs will be the only ones in the vector loop. | |
2517 | |
2518 In that case, update_ssa will have added a new virtual phi to the | |
2519 main loop, which previously didn't need one. Ensure that we (locally) | |
2520 maintain LCSSA form for the virtual operand, just as we would have | |
2521 done if the virtual phi had existed from the outset. This makes it | |
2522 easier to duplicate the scalar epilogue loop below. */ | |
2523 tree vop_to_rename = NULL_TREE; | |
2524 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) | |
2525 { | |
2526 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo); | |
2527 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop); | |
2528 } | |
2435 | 2529 |
2436 if (MAY_HAVE_DEBUG_BIND_STMTS) | 2530 if (MAY_HAVE_DEBUG_BIND_STMTS) |
2437 { | 2531 { |
2438 gcc_assert (!adjust_vec.exists ()); | 2532 gcc_assert (!adjust_vec.exists ()); |
2439 adjust_vec.create (32); | 2533 adjust_vec.create (32); |
2448 so that we can also get the upper bound on the number of iterations. */ | 2542 so that we can also get the upper bound on the number of iterations. */ |
2449 tree niters_prolog; | 2543 tree niters_prolog; |
2450 int bound_prolog = 0; | 2544 int bound_prolog = 0; |
2451 if (prolog_peeling) | 2545 if (prolog_peeling) |
2452 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor, | 2546 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor, |
2453 &bound_prolog); | 2547 &bound_prolog); |
2454 else | 2548 else |
2455 niters_prolog = build_int_cst (type, 0); | 2549 niters_prolog = build_int_cst (type, 0); |
2456 | 2550 |
2551 loop_vec_info epilogue_vinfo = NULL; | |
2552 if (vect_epilogues) | |
2553 { | |
2554 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; | |
2555 loop_vinfo->epilogue_vinfos.ordered_remove (0); | |
2556 } | |
2557 | |
2558 tree niters_vector_mult_vf = NULL_TREE; | |
2559 /* Saving NITERs before the loop, as this may be changed by prologue. */ | |
2560 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo); | |
2561 edge update_e = NULL, skip_e = NULL; | |
2562 unsigned int lowest_vf = constant_lower_bound (vf); | |
2563 /* If we know the number of scalar iterations for the main loop we should | |
2564 check whether after the main loop there are enough iterations left over | |
2565 for the epilogue. */ | |
2566 if (vect_epilogues | |
2567 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
2568 && prolog_peeling >= 0 | |
2569 && known_eq (vf, lowest_vf)) | |
2570 { | |
2571 unsigned HOST_WIDE_INT eiters | |
2572 = (LOOP_VINFO_INT_NITERS (loop_vinfo) | |
2573 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); | |
2574 | |
2575 eiters -= prolog_peeling; | |
2576 eiters | |
2577 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); | |
2578 | |
2579 unsigned int ratio; | |
2580 unsigned int epilogue_gaps | |
2581 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo); | |
2582 while (!(constant_multiple_p | |
2583 (GET_MODE_SIZE (loop_vinfo->vector_mode), | |
2584 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio) | |
2585 && eiters >= lowest_vf / ratio + epilogue_gaps)) | |
2586 { | |
2587 delete epilogue_vinfo; | |
2588 epilogue_vinfo = NULL; | |
2589 if (loop_vinfo->epilogue_vinfos.length () == 0) | |
2590 { | |
2591 vect_epilogues = false; | |
2592 break; | |
2593 } | |
2594 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; | |
2595 loop_vinfo->epilogue_vinfos.ordered_remove (0); | |
2596 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo); | |
2597 } | |
2598 } | |
2457 /* Prolog loop may be skipped. */ | 2599 /* Prolog loop may be skipped. */ |
2458 bool skip_prolog = (prolog_peeling != 0); | 2600 bool skip_prolog = (prolog_peeling != 0); |
2459 /* Skip to epilog if scalar loop may be preferred. It's only needed | 2601 /* Skip this loop to epilog when there are not enough iterations to enter this |
2460 when we peel for epilog loop and when it hasn't been checked with | 2602 vectorized loop. If true we should perform runtime checks on the NITERS |
2461 loop versioning. */ | 2603 to check whether we should skip the current vectorized loop. If we know |
2604 the number of scalar iterations we may choose to add a runtime check if | |
2605 this number "maybe" smaller than the number of iterations required | |
2606 when we know the number of scalar iterations may potentially | |
2607 be smaller than the number of iterations required to enter this loop, for | |
2608 this we use the upper bounds on the prolog and epilog peeling. When we | |
2609 don't know the number of iterations and don't require versioning it is | |
2610 because we have asserted that there are enough scalar iterations to enter | |
2611 the main loop, so this skip is not necessary. When we are versioning then | |
2612 we only add such a skip if we have chosen to vectorize the epilogue. */ | |
2462 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | 2613 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
2463 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo), | 2614 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo), |
2464 bound_prolog + bound_epilog) | 2615 bound_prolog + bound_epilog) |
2465 : !LOOP_REQUIRES_VERSIONING (loop_vinfo)); | 2616 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo) |
2617 || vect_epilogues)); | |
2466 /* Epilog loop must be executed if the number of iterations for epilog | 2618 /* Epilog loop must be executed if the number of iterations for epilog |
2467 loop is known at compile time, otherwise we need to add a check at | 2619 loop is known at compile time, otherwise we need to add a check at |
2468 the end of vector loop and skip to the end of epilog loop. */ | 2620 the end of vector loop and skip to the end of epilog loop. */ |
2469 bool skip_epilog = (prolog_peeling < 0 | 2621 bool skip_epilog = (prolog_peeling < 0 |
2470 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | 2622 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
2489 scale_loop_profile (loop, prob_vector, 0); | 2641 scale_loop_profile (loop, prob_vector, 0); |
2490 } | 2642 } |
2491 } | 2643 } |
2492 | 2644 |
2493 dump_user_location_t loop_loc = find_loop_location (loop); | 2645 dump_user_location_t loop_loc = find_loop_location (loop); |
2494 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); | 2646 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); |
2647 if (vect_epilogues) | |
2648 /* Make sure to set the epilogue's epilogue scalar loop, such that we can | |
2649 use the original scalar loop as remaining epilogue if necessary. */ | |
2650 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo) | |
2651 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); | |
2652 | |
2495 if (prolog_peeling) | 2653 if (prolog_peeling) |
2496 { | 2654 { |
2497 e = loop_preheader_edge (loop); | 2655 e = loop_preheader_edge (loop); |
2498 if (!slpeel_can_duplicate_loop_p (loop, e)) | 2656 if (!slpeel_can_duplicate_loop_p (loop, e)) |
2499 { | 2657 { |
2507 { | 2665 { |
2508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, | 2666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, |
2509 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n"); | 2667 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n"); |
2510 gcc_unreachable (); | 2668 gcc_unreachable (); |
2511 } | 2669 } |
2670 prolog->force_vectorize = false; | |
2512 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true); | 2671 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true); |
2513 first_loop = prolog; | 2672 first_loop = prolog; |
2514 reset_original_copy_tables (); | 2673 reset_original_copy_tables (); |
2515 | 2674 |
2516 /* Update the number of iterations for prolog loop. */ | 2675 /* Update the number of iterations for prolog loop. */ |
2535 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e); | 2694 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e); |
2536 | 2695 |
2537 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog); | 2696 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog); |
2538 scale_loop_profile (prolog, prob_prolog, bound_prolog); | 2697 scale_loop_profile (prolog, prob_prolog, bound_prolog); |
2539 } | 2698 } |
2699 | |
2540 /* Update init address of DRs. */ | 2700 /* Update init address of DRs. */ |
2541 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR); | 2701 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR); |
2542 /* Update niters for vector loop. */ | 2702 /* Update niters for vector loop. */ |
2543 LOOP_VINFO_NITERS (loop_vinfo) | 2703 LOOP_VINFO_NITERS (loop_vinfo) |
2544 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog); | 2704 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog); |
2569 { | 2729 { |
2570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, | 2730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, |
2571 "loop can't be duplicated to exit edge.\n"); | 2731 "loop can't be duplicated to exit edge.\n"); |
2572 gcc_unreachable (); | 2732 gcc_unreachable (); |
2573 } | 2733 } |
2574 /* Peel epilog and put it on exit edge of loop. */ | 2734 /* Peel epilog and put it on exit edge of loop. If we are vectorizing |
2575 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e); | 2735 said epilog then we should use a copy of the main loop as a starting |
2736 point. This loop may have already had some preliminary transformations | |
2737 to allow for more optimal vectorization, for example if-conversion. | |
2738 If we are not vectorizing the epilog then we should use the scalar loop | |
2739 as the transformations mentioned above make less or no sense when not | |
2740 vectorizing. */ | |
2741 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop; | |
2742 if (vop_to_rename) | |
2743 { | |
2744 /* Vectorizing the main loop can sometimes introduce a vdef to | |
2745 a loop that previously didn't have one; see the comment above | |
2746 the definition of VOP_TO_RENAME for details. The definition | |
2747 D that holds on E will then be different from the definition | |
2748 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to | |
2749 rename VOP_TO_RENAME to D when copying the loop. | |
2750 | |
2751 The virtual operand is in LCSSA form for the main loop, | |
2752 and no stmt between the main loop and E needs a vdef, | |
2753 so we know that D is provided by a phi rather than by a | |
2754 vdef on a normal gimple stmt. */ | |
2755 basic_block vdef_bb = e->src; | |
2756 gphi *vphi; | |
2757 while (!(vphi = get_virtual_phi (vdef_bb))) | |
2758 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb); | |
2759 gcc_assert (vop_to_rename != gimple_phi_result (vphi)); | |
2760 set_current_def (vop_to_rename, gimple_phi_result (vphi)); | |
2761 } | |
2762 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e); | |
2576 if (!epilog) | 2763 if (!epilog) |
2577 { | 2764 { |
2578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, | 2765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, |
2579 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n"); | 2766 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n"); |
2580 gcc_unreachable (); | 2767 gcc_unreachable (); |
2581 } | 2768 } |
2769 epilog->force_vectorize = false; | |
2582 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false); | 2770 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false); |
2583 | 2771 |
2584 /* Scalar version loop may be preferred. In this case, add guard | 2772 /* Scalar version loop may be preferred. In this case, add guard |
2585 and skip to epilog. Note this only happens when the number of | 2773 and skip to epilog. Note this only happens when the number of |
2586 iterations of loop is unknown at compile time, otherwise this | 2774 iterations of loop is unknown at compile time, otherwise this |
2598 guard_to = split_edge (loop_preheader_edge (epilog)); | 2786 guard_to = split_edge (loop_preheader_edge (epilog)); |
2599 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, | 2787 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, |
2600 guard_to, guard_bb, | 2788 guard_to, guard_bb, |
2601 prob_vector.invert (), | 2789 prob_vector.invert (), |
2602 irred_flag); | 2790 irred_flag); |
2791 skip_e = guard_e; | |
2603 e = EDGE_PRED (guard_to, 0); | 2792 e = EDGE_PRED (guard_to, 0); |
2604 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); | 2793 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); |
2605 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e); | 2794 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e); |
2606 | 2795 |
2607 /* Simply propagate profile info from guard_bb to guard_to which is | 2796 /* Simply propagate profile info from guard_bb to guard_to which is |
2619 (prob_vector)); | 2808 (prob_vector)); |
2620 free (bbs); | 2809 free (bbs); |
2621 } | 2810 } |
2622 | 2811 |
2623 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src; | 2812 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src; |
2624 tree niters_vector_mult_vf; | |
2625 /* If loop is peeled for non-zero constant times, now niters refers to | 2813 /* If loop is peeled for non-zero constant times, now niters refers to |
2626 orig_niters - prolog_peeling, it won't overflow even the orig_niters | 2814 orig_niters - prolog_peeling, it won't overflow even the orig_niters |
2627 overflows. */ | 2815 overflows. */ |
2628 niters_no_overflow |= (prolog_peeling > 0); | 2816 niters_no_overflow |= (prolog_peeling > 0); |
2629 vect_gen_vector_loop_niters (loop_vinfo, niters, | 2817 vect_gen_vector_loop_niters (loop_vinfo, niters, |
2642 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector, | 2830 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector, |
2643 &niters_vector_mult_vf); | 2831 &niters_vector_mult_vf); |
2644 /* Update IVs of original loop as if they were advanced by | 2832 /* Update IVs of original loop as if they were advanced by |
2645 niters_vector_mult_vf steps. */ | 2833 niters_vector_mult_vf steps. */ |
2646 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo)); | 2834 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo)); |
2647 edge update_e = skip_vector ? e : loop_preheader_edge (epilog); | 2835 update_e = skip_vector ? e : loop_preheader_edge (epilog); |
2648 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf, | 2836 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf, |
2649 update_e); | 2837 update_e); |
2650 | 2838 |
2651 if (skip_epilog) | 2839 if (skip_epilog) |
2652 { | 2840 { |
2683 | 2871 |
2684 delete_update_ssa (); | 2872 delete_update_ssa (); |
2685 adjust_vec_debug_stmts (); | 2873 adjust_vec_debug_stmts (); |
2686 scev_reset (); | 2874 scev_reset (); |
2687 } | 2875 } |
2876 | |
2877 if (vect_epilogues) | |
2878 { | |
2879 epilog->aux = epilogue_vinfo; | |
2880 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog; | |
2881 | |
2882 loop_constraint_clear (epilog, LOOP_C_INFINITE); | |
2883 | |
2884 /* We now must calculate the number of NITERS performed by the previous | |
2885 loop and EPILOGUE_NITERS to be performed by the epilogue. */ | |
2886 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf), | |
2887 niters_prolog, niters_vector_mult_vf); | |
2888 | |
2889 /* If skip_vector we may skip the previous loop, we insert a phi-node to | |
2890 determine whether we are coming from the previous vectorized loop | |
2891 using the update_e edge or the skip_vector basic block using the | |
2892 skip_e edge. */ | |
2893 if (skip_vector) | |
2894 { | |
2895 gcc_assert (update_e != NULL && skip_e != NULL); | |
2896 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)), | |
2897 update_e->dest); | |
2898 tree new_ssa = make_ssa_name (TREE_TYPE (niters)); | |
2899 gimple *stmt = gimple_build_assign (new_ssa, niters); | |
2900 gimple_stmt_iterator gsi; | |
2901 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME | |
2902 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL) | |
2903 { | |
2904 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf)); | |
2905 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); | |
2906 } | |
2907 else | |
2908 { | |
2909 gsi = gsi_last_bb (update_e->src); | |
2910 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT); | |
2911 } | |
2912 | |
2913 niters = new_ssa; | |
2914 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION); | |
2915 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e, | |
2916 UNKNOWN_LOCATION); | |
2917 niters = PHI_RESULT (new_phi); | |
2918 } | |
2919 | |
2920 /* Subtract the number of iterations performed by the vectorized loop | |
2921 from the number of total iterations. */ | |
2922 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), | |
2923 before_loop_niters, | |
2924 niters); | |
2925 | |
2926 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters; | |
2927 LOOP_VINFO_NITERSM1 (epilogue_vinfo) | |
2928 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters), | |
2929 epilogue_niters, | |
2930 build_one_cst (TREE_TYPE (epilogue_niters))); | |
2931 | |
2932 /* Set ADVANCE to the number of iterations performed by the previous | |
2933 loop and its prologue. */ | |
2934 *advance = niters; | |
2935 | |
2936 /* Redo the peeling for niter analysis as the NITERs and alignment | |
2937 may have been updated to take the main loop into account. */ | |
2938 determine_peel_for_niter (epilogue_vinfo); | |
2939 } | |
2940 | |
2688 adjust_vec.release (); | 2941 adjust_vec.release (); |
2689 free_original_copy_tables (); | 2942 free_original_copy_tables (); |
2690 | 2943 |
2691 return epilog; | 2944 return vect_epilogues ? epilog : NULL; |
2692 } | 2945 } |
2693 | 2946 |
2694 /* Function vect_create_cond_for_niters_checks. | 2947 /* Function vect_create_cond_for_niters_checks. |
2695 | 2948 |
2696 Create a conditional expression that represents the run-time checks for | 2949 Create a conditional expression that represents the run-time checks for |
2949 cost model threshold TH. | 3202 cost model threshold TH. |
2950 | 3203 |
2951 The versioning precondition(s) are placed in *COND_EXPR and | 3204 The versioning precondition(s) are placed in *COND_EXPR and |
2952 *COND_EXPR_STMT_LIST. */ | 3205 *COND_EXPR_STMT_LIST. */ |
2953 | 3206 |
2954 void | 3207 class loop * |
2955 vect_loop_versioning (loop_vec_info loop_vinfo, | 3208 vect_loop_versioning (loop_vec_info loop_vinfo, |
2956 unsigned int th, bool check_profitability, | 3209 gimple *loop_vectorized_call) |
2957 poly_uint64 versioning_threshold) | 3210 { |
2958 { | 3211 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop; |
2959 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop; | 3212 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); |
2960 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); | |
2961 basic_block condition_bb; | 3213 basic_block condition_bb; |
2962 gphi_iterator gsi; | 3214 gphi_iterator gsi; |
2963 gimple_stmt_iterator cond_exp_gsi; | 3215 gimple_stmt_iterator cond_exp_gsi; |
2964 basic_block merge_bb; | 3216 basic_block merge_bb; |
2965 basic_block new_exit_bb; | 3217 basic_block new_exit_bb; |
2972 gimple_seq gimplify_stmt_list = NULL; | 3224 gimple_seq gimplify_stmt_list = NULL; |
2973 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo); | 3225 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo); |
2974 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo); | 3226 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo); |
2975 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo); | 3227 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo); |
2976 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo); | 3228 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo); |
2977 | 3229 poly_uint64 versioning_threshold |
2978 if (check_profitability) | 3230 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); |
3231 tree version_simd_if_cond | |
3232 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo); | |
3233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); | |
3234 | |
3235 if (vect_apply_runtime_profitability_check_p (loop_vinfo) | |
3236 && !ordered_p (th, versioning_threshold)) | |
2979 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, | 3237 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, |
2980 build_int_cst (TREE_TYPE (scalar_loop_iters), | 3238 build_int_cst (TREE_TYPE (scalar_loop_iters), |
2981 th - 1)); | 3239 th - 1)); |
2982 if (maybe_ne (versioning_threshold, 0U)) | 3240 if (maybe_ne (versioning_threshold, 0U)) |
2983 { | 3241 { |
2993 | 3251 |
2994 if (version_niter) | 3252 if (version_niter) |
2995 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr); | 3253 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr); |
2996 | 3254 |
2997 if (cond_expr) | 3255 if (cond_expr) |
2998 cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list, | 3256 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), |
3257 &cond_expr_stmt_list, | |
2999 is_gimple_condexpr, NULL_TREE); | 3258 is_gimple_condexpr, NULL_TREE); |
3000 | 3259 |
3001 if (version_align) | 3260 if (version_align) |
3002 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr, | 3261 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr, |
3003 &cond_expr_stmt_list); | 3262 &cond_expr_stmt_list); |
3005 if (version_alias) | 3264 if (version_alias) |
3006 { | 3265 { |
3007 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr); | 3266 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr); |
3008 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr); | 3267 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr); |
3009 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr); | 3268 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr); |
3269 } | |
3270 | |
3271 if (version_simd_if_cond) | |
3272 { | |
3273 gcc_assert (dom_info_available_p (CDI_DOMINATORS)); | |
3274 if (flag_checking) | |
3275 if (basic_block bb | |
3276 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond))) | |
3277 gcc_assert (bb != loop->header | |
3278 && dominated_by_p (CDI_DOMINATORS, loop->header, bb) | |
3279 && (scalar_loop == NULL | |
3280 || (bb != scalar_loop->header | |
3281 && dominated_by_p (CDI_DOMINATORS, | |
3282 scalar_loop->header, bb)))); | |
3283 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond)); | |
3284 tree c = fold_build2 (NE_EXPR, boolean_type_node, | |
3285 version_simd_if_cond, zero); | |
3286 if (cond_expr) | |
3287 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, | |
3288 c, cond_expr); | |
3289 else | |
3290 cond_expr = c; | |
3291 if (dump_enabled_p ()) | |
3292 dump_printf_loc (MSG_NOTE, vect_location, | |
3293 "created versioning for simd if condition check.\n"); | |
3010 } | 3294 } |
3011 | 3295 |
3012 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), | 3296 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), |
3013 &gimplify_stmt_list, | 3297 &gimplify_stmt_list, |
3014 is_gimple_condexpr, NULL_TREE); | 3298 is_gimple_condexpr, NULL_TREE); |
3015 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); | 3299 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); |
3016 | 3300 |
3017 initialize_original_copy_tables (); | 3301 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are |
3018 if (scalar_loop) | 3302 invariant in. */ |
3019 { | 3303 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr); |
3020 edge scalar_e; | 3304 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list); |
3021 basic_block preheader, scalar_preheader; | 3305 !gsi_end_p (gsi); gsi_next (&gsi)) |
3022 | 3306 { |
3023 /* We don't want to scale SCALAR_LOOP's frequencies, we need to | 3307 gimple *stmt = gsi_stmt (gsi); |
3024 scale LOOP's frequencies instead. */ | 3308 update_stmt (stmt); |
3025 nloop = loop_version (scalar_loop, cond_expr, &condition_bb, | 3309 ssa_op_iter iter; |
3310 use_operand_p use_p; | |
3311 basic_block def_bb; | |
3312 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE) | |
3313 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p)))) | |
3314 && flow_bb_inside_loop_p (outermost, def_bb)) | |
3315 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1); | |
3316 } | |
3317 | |
3318 /* Search for the outermost loop we can version. Avoid versioning of | |
3319 non-perfect nests but allow if-conversion versioned loops inside. */ | |
3320 class loop *loop_to_version = loop; | |
3321 if (flow_loop_nested_p (outermost, loop)) | |
3322 { | |
3323 if (dump_enabled_p ()) | |
3324 dump_printf_loc (MSG_NOTE, vect_location, | |
3325 "trying to apply versioning to outer loop %d\n", | |
3326 outermost->num); | |
3327 if (outermost->num == 0) | |
3328 outermost = superloop_at_depth (loop, 1); | |
3329 /* And avoid applying versioning on non-perfect nests. */ | |
3330 while (loop_to_version != outermost | |
3331 && single_exit (loop_outer (loop_to_version)) | |
3332 && (!loop_outer (loop_to_version)->inner->next | |
3333 || vect_loop_vectorized_call (loop_to_version)) | |
3334 && (!loop_outer (loop_to_version)->inner->next | |
3335 || !loop_outer (loop_to_version)->inner->next->next)) | |
3336 loop_to_version = loop_outer (loop_to_version); | |
3337 } | |
3338 | |
3339 /* Apply versioning. If there is already a scalar version created by | |
3340 if-conversion re-use that. Note we cannot re-use the copy of | |
3341 an if-converted outer-loop when vectorizing the inner loop only. */ | |
3342 gcond *cond; | |
3343 if ((!loop_to_version->inner || loop == loop_to_version) | |
3344 && loop_vectorized_call) | |
3345 { | |
3346 gcc_assert (scalar_loop); | |
3347 condition_bb = gimple_bb (loop_vectorized_call); | |
3348 cond = as_a <gcond *> (last_stmt (condition_bb)); | |
3349 gimple_cond_set_condition_from_tree (cond, cond_expr); | |
3350 update_stmt (cond); | |
3351 | |
3352 if (cond_expr_stmt_list) | |
3353 { | |
3354 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call); | |
3355 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, | |
3356 GSI_SAME_STMT); | |
3357 } | |
3358 | |
3359 /* if-conversion uses profile_probability::always () for both paths, | |
3360 reset the paths probabilities appropriately. */ | |
3361 edge te, fe; | |
3362 extract_true_false_edges_from_block (condition_bb, &te, &fe); | |
3363 te->probability = prob; | |
3364 fe->probability = prob.invert (); | |
3365 /* We can scale loops counts immediately but have to postpone | |
3366 scaling the scalar loop because we re-use it during peeling. */ | |
3367 scale_loop_frequencies (loop_to_version, te->probability); | |
3368 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability; | |
3369 | |
3370 nloop = scalar_loop; | |
3371 if (dump_enabled_p ()) | |
3372 dump_printf_loc (MSG_NOTE, vect_location, | |
3373 "reusing %sloop version created by if conversion\n", | |
3374 loop_to_version != loop ? "outer " : ""); | |
3375 } | |
3376 else | |
3377 { | |
3378 if (loop_to_version != loop | |
3379 && dump_enabled_p ()) | |
3380 dump_printf_loc (MSG_NOTE, vect_location, | |
3381 "applying loop versioning to outer loop %d\n", | |
3382 loop_to_version->num); | |
3383 | |
3384 initialize_original_copy_tables (); | |
3385 nloop = loop_version (loop_to_version, cond_expr, &condition_bb, | |
3026 prob, prob.invert (), prob, prob.invert (), true); | 3386 prob, prob.invert (), prob, prob.invert (), true); |
3027 scale_loop_frequencies (loop, prob); | 3387 gcc_assert (nloop); |
3028 /* CONDITION_BB was created above SCALAR_LOOP's preheader, | 3388 nloop = get_loop_copy (loop); |
3029 while we need to move it above LOOP's preheader. */ | 3389 |
3030 e = loop_preheader_edge (loop); | 3390 /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will |
3031 scalar_e = loop_preheader_edge (scalar_loop); | 3391 reap those otherwise; they also refer to the original |
3032 /* The vector loop preheader might not be empty, since new | 3392 loops. */ |
3033 invariants could have been created while analyzing the loop. */ | 3393 class loop *l = loop; |
3034 gcc_assert (single_pred_p (e->src)); | 3394 while (gimple *call = vect_loop_vectorized_call (l)) |
3035 gcc_assert (empty_block_p (scalar_e->src) | 3395 { |
3036 && single_pred_p (scalar_e->src)); | 3396 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call))); |
3037 gcc_assert (single_pred_p (condition_bb)); | 3397 fold_loop_internal_call (call, boolean_false_node); |
3038 preheader = e->src; | 3398 l = loop_outer (l); |
3039 scalar_preheader = scalar_e->src; | 3399 } |
3040 scalar_e = find_edge (condition_bb, scalar_preheader); | 3400 free_original_copy_tables (); |
3041 e = single_pred_edge (preheader); | 3401 |
3042 redirect_edge_and_branch_force (single_pred_edge (condition_bb), | 3402 if (cond_expr_stmt_list) |
3043 scalar_preheader); | 3403 { |
3044 redirect_edge_and_branch_force (scalar_e, preheader); | 3404 cond_exp_gsi = gsi_last_bb (condition_bb); |
3045 redirect_edge_and_branch_force (e, condition_bb); | 3405 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, |
3046 set_immediate_dominator (CDI_DOMINATORS, condition_bb, | 3406 GSI_SAME_STMT); |
3047 single_pred (condition_bb)); | 3407 } |
3048 set_immediate_dominator (CDI_DOMINATORS, scalar_preheader, | 3408 |
3049 single_pred (scalar_preheader)); | 3409 /* Loop versioning violates an assumption we try to maintain during |
3050 set_immediate_dominator (CDI_DOMINATORS, preheader, | 3410 vectorization - that the loop exit block has a single predecessor. |
3051 condition_bb); | 3411 After versioning, the exit block of both loop versions is the same |
3052 } | 3412 basic block (i.e. it has two predecessors). Just in order to simplify |
3053 else | 3413 following transformations in the vectorizer, we fix this situation |
3054 nloop = loop_version (loop, cond_expr, &condition_bb, | 3414 here by adding a new (empty) block on the exit-edge of the loop, |
3055 prob, prob.invert (), prob, prob.invert (), true); | 3415 with the proper loop-exit phis to maintain loop-closed-form. |
3416 If loop versioning wasn't done from loop, but scalar_loop instead, | |
3417 merge_bb will have already just a single successor. */ | |
3418 | |
3419 merge_bb = single_exit (loop_to_version)->dest; | |
3420 if (EDGE_COUNT (merge_bb->preds) >= 2) | |
3421 { | |
3422 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2); | |
3423 new_exit_bb = split_edge (single_exit (loop_to_version)); | |
3424 new_exit_e = single_exit (loop_to_version); | |
3425 e = EDGE_SUCC (new_exit_bb, 0); | |
3426 | |
3427 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); | |
3428 gsi_next (&gsi)) | |
3429 { | |
3430 tree new_res; | |
3431 orig_phi = gsi.phi (); | |
3432 new_res = copy_ssa_name (PHI_RESULT (orig_phi)); | |
3433 new_phi = create_phi_node (new_res, new_exit_bb); | |
3434 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); | |
3435 add_phi_arg (new_phi, arg, new_exit_e, | |
3436 gimple_phi_arg_location_from_edge (orig_phi, e)); | |
3437 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); | |
3438 } | |
3439 } | |
3440 | |
3441 update_ssa (TODO_update_ssa); | |
3442 } | |
3056 | 3443 |
3057 if (version_niter) | 3444 if (version_niter) |
3058 { | 3445 { |
3059 /* The versioned loop could be infinite, we need to clear existing | 3446 /* The versioned loop could be infinite, we need to clear existing |
3060 niter information which is copied from the original loop. */ | 3447 niter information which is copied from the original loop. */ |
3077 vect_location, | 3464 vect_location, |
3078 "loop versioned for vectorization to enhance " | 3465 "loop versioned for vectorization to enhance " |
3079 "alignment\n"); | 3466 "alignment\n"); |
3080 | 3467 |
3081 } | 3468 } |
3082 free_original_copy_tables (); | 3469 |
3083 | 3470 return nloop; |
3084 /* Loop versioning violates an assumption we try to maintain during | 3471 } |
3085 vectorization - that the loop exit block has a single predecessor. | |
3086 After versioning, the exit block of both loop versions is the same | |
3087 basic block (i.e. it has two predecessors). Just in order to simplify | |
3088 following transformations in the vectorizer, we fix this situation | |
3089 here by adding a new (empty) block on the exit-edge of the loop, | |
3090 with the proper loop-exit phis to maintain loop-closed-form. | |
3091 If loop versioning wasn't done from loop, but scalar_loop instead, | |
3092 merge_bb will have already just a single successor. */ | |
3093 | |
3094 merge_bb = single_exit (loop)->dest; | |
3095 if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2) | |
3096 { | |
3097 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2); | |
3098 new_exit_bb = split_edge (single_exit (loop)); | |
3099 new_exit_e = single_exit (loop); | |
3100 e = EDGE_SUCC (new_exit_bb, 0); | |
3101 | |
3102 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) | |
3103 { | |
3104 tree new_res; | |
3105 orig_phi = gsi.phi (); | |
3106 new_res = copy_ssa_name (PHI_RESULT (orig_phi)); | |
3107 new_phi = create_phi_node (new_res, new_exit_bb); | |
3108 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); | |
3109 add_phi_arg (new_phi, arg, new_exit_e, | |
3110 gimple_phi_arg_location_from_edge (orig_phi, e)); | |
3111 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); | |
3112 } | |
3113 } | |
3114 | |
3115 /* End loop-exit-fixes after versioning. */ | |
3116 | |
3117 if (cond_expr_stmt_list) | |
3118 { | |
3119 cond_exp_gsi = gsi_last_bb (condition_bb); | |
3120 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, | |
3121 GSI_SAME_STMT); | |
3122 } | |
3123 update_ssa (TODO_update_ssa); | |
3124 } |