comparison gcc/tree-vect-loop-manip.c @ 132:d34655255c78

update gcc-8.2
author mir3636
date Thu, 25 Oct 2018 10:21:07 +0900
parents 84e7813d76e9
children 1830386684a0
comparison
130:e108057fa461 vs 132:d34655255c78
1 /* Vectorizer Specific Loop Manipulations 1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2017 Free Software Foundation, Inc. 2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> 3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com> 4 and Ira Rosen <irar@il.ibm.com>
5 5
6 This file is part of GCC. 6 This file is part of GCC.
7 7
39 #include "tree-ssa.h" 39 #include "tree-ssa.h"
40 #include "cfgloop.h" 40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h" 41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h" 42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h" 43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
44 50
45 /************************************************************************* 51 /*************************************************************************
46 Simple Loop Peeling Utilities 52 Simple Loop Peeling Utilities
47 53
48 Utilities to support loop peeling for vectorization purposes. 54 Utilities to support loop peeling for vectorization purposes.
190 /* Adjust debug stmts as scheduled before. */ 196 /* Adjust debug stmts as scheduled before. */
191 197
192 static void 198 static void
193 adjust_vec_debug_stmts (void) 199 adjust_vec_debug_stmts (void)
194 { 200 {
195 if (!MAY_HAVE_DEBUG_STMTS) 201 if (!MAY_HAVE_DEBUG_BIND_STMTS)
196 return; 202 return;
197 203
198 gcc_assert (adjust_vec.exists ()); 204 gcc_assert (adjust_vec.exists ());
199 205
200 while (!adjust_vec.is_empty ()) 206 while (!adjust_vec.is_empty ())
212 static void 218 static void
213 adjust_debug_stmts (tree from, tree to, basic_block bb) 219 adjust_debug_stmts (tree from, tree to, basic_block bb)
214 { 220 {
215 adjust_info ai; 221 adjust_info ai;
216 222
217 if (MAY_HAVE_DEBUG_STMTS 223 if (MAY_HAVE_DEBUG_BIND_STMTS
218 && TREE_CODE (from) == SSA_NAME 224 && TREE_CODE (from) == SSA_NAME
219 && ! SSA_NAME_IS_DEFAULT_DEF (from) 225 && ! SSA_NAME_IS_DEFAULT_DEF (from)
220 && ! virtual_operand_p (from)) 226 && ! virtual_operand_p (from))
221 { 227 {
222 ai.from = from; 228 ai.from = from;
240 { 246 {
241 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e); 247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
242 248
243 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def); 249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
244 250
245 if (MAY_HAVE_DEBUG_STMTS) 251 if (MAY_HAVE_DEBUG_BIND_STMTS)
246 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi), 252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
247 gimple_bb (update_phi)); 253 gimple_bb (update_phi));
248 } 254 }
249 255
250 /* Make the LOOP iterate NITERS times. This is done by adding a new IV 256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
251 that starts at zero, increases by one and its limit is NITERS. 257 the mask should have during the first iteration and NEXT_MASK is the
252 258 value that it should have on subsequent iterations. */
253 Assumption: the exit-condition of LOOP is the last stmt in the loop. */ 259
254 260 static void
255 void 261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
256 slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters) 262 tree next_mask)
263 {
264 gphi *phi = create_phi_node (mask, loop->header);
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267 }
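
[Editorial illustration] The PHI created by vect_set_loop_mask carries INIT_MASK into the first iteration and NEXT_MASK around the latch for every later one. Below is a minimal standalone C model of that behaviour, not GCC code: N, VF and the bool array are made-up stand-ins for the real vector mask, and the printf body stands in for the masked vector statements.

#include <stdio.h>
#include <stdbool.h>

int main (void)
{
  enum { N = 10, VF = 4 };
  bool mask[VF];

  /* init_mask: value of the PHI on entry from the preheader.  */
  for (int j = 0; j < VF; j++)
    mask[j] = (j < N);

  for (int i = 0; i < N; i += VF)
    {
      for (int j = 0; j < VF; j++)
        if (mask[j])
          printf ("lane %d handles scalar iteration %d\n", j, i + j);

      /* next_mask: value of the PHI on the edge from the latch.  */
      for (int j = 0; j < VF; j++)
        mask[j] = (i + VF + j < N);
    }
  return 0;
}
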
268
269 /* Add SEQ to the end of LOOP's preheader block. */
270
271 static void
272 add_preheader_seq (struct loop *loop, gimple_seq seq)
273 {
274 if (seq)
275 {
276 edge pe = loop_preheader_edge (loop);
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278 gcc_assert (!new_bb);
279 }
280 }
281
282 /* Add SEQ to the beginning of LOOP's header block. */
283
284 static void
285 add_header_seq (struct loop *loop, gimple_seq seq)
286 {
287 if (seq)
288 {
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291 }
292 }
293
294 /* Return true if the target can interleave elements of two vectors.
295 OFFSET is 0 if the first half of the vectors should be interleaved
296 or 1 if the second half should. When returning true, store the
297 associated permutation in INDICES. */
298
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 unsigned int offset)
302 {
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304 poly_uint64 base = exact_div (nelts, 2) * offset;
305 vec_perm_builder sel (nelts, 2, 3);
306 for (unsigned int i = 0; i < 3; ++i)
307 {
308 sel.quick_push (base + i);
309 sel.quick_push (base + i + nelts);
310 }
311 indices->new_vector (sel, 2, nelts);
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313 }
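
[Editorial illustration] To make the selector built by interleave_supported_p concrete, here is a small standalone C sketch (not GCC code) that prints the fully expanded permutation for a hypothetical 8-lane mask type: OFFSET 0 interleaves the low halves of the two inputs, OFFSET 1 the high halves, and the three pairs pushed into the builder repeat periodically to cover the remaining lanes.

#include <stdio.h>

/* Print the interleave selector for NELTS lanes and the given OFFSET.  */
static void
print_interleave (unsigned int nelts, unsigned int offset)
{
  unsigned int base = nelts / 2 * offset;
  for (unsigned int i = 0; i < nelts / 2; i++)
    printf ("%u %u ", base + i, base + i + nelts);
  printf ("\n");
}

int main (void)
{
  print_interleave (8, 0);   /* 0 8 1 9 2 10 3 11 */
  print_interleave (8, 1);   /* 4 12 5 13 6 14 7 15 */
  return 0;
}
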
314
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316 in SRC_RGM, given that the former has twice as many masks as the
317 latter. Return true on success, adding any new statements to SEQ. */
318
319 static bool
320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321 rgroup_masks *src_rgm)
322 {
323 tree src_masktype = src_rgm->mask_type;
324 tree dest_masktype = dest_rgm->mask_type;
325 machine_mode src_mode = TYPE_MODE (src_masktype);
326 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
329 {
330 /* Unpacking the source masks gives at least as many mask bits as
331 we need. We can then VIEW_CONVERT any excess bits away. */
332 tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
334 {
335 tree src = src_rgm->masks[i / 2];
336 tree dest = dest_rgm->masks[i];
337 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
338 ? VEC_UNPACK_HI_EXPR
339 : VEC_UNPACK_LO_EXPR);
340 gassign *stmt;
341 if (dest_masktype == unpack_masktype)
342 stmt = gimple_build_assign (dest, code, src);
343 else
344 {
345 tree temp = make_ssa_name (unpack_masktype);
346 stmt = gimple_build_assign (temp, code, src);
347 gimple_seq_add_stmt (seq, stmt);
348 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
349 build1 (VIEW_CONVERT_EXPR,
350 dest_masktype, temp));
351 }
352 gimple_seq_add_stmt (seq, stmt);
353 }
354 return true;
355 }
356 vec_perm_indices indices[2];
357 if (dest_masktype == src_masktype
358 && interleave_supported_p (&indices[0], src_masktype, 0)
359 && interleave_supported_p (&indices[1], src_masktype, 1))
360 {
361 /* The destination requires twice as many mask bits as the source, so
362 we can use interleaving permutes to double up the number of bits. */
363 tree masks[2];
364 for (unsigned int i = 0; i < 2; ++i)
365 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
366 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
367 {
368 tree src = src_rgm->masks[i / 2];
369 tree dest = dest_rgm->masks[i];
370 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
371 src, src, masks[i & 1]);
372 gimple_seq_add_stmt (seq, stmt);
373 }
374 return true;
375 }
376 return false;
377 }
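
[Editorial illustration] A minimal standalone sketch of the VEC_PERM_EXPR path above, assuming an 8-lane source mask with made-up bit values: interleaving the source mask with itself duplicates each mask bit, so the two destination masks together cover the same scalar iterations with twice as many mask bits (lane ordering shown in little-endian convention).

#include <stdio.h>

int main (void)
{
  enum { NELTS = 8 };
  /* Source mask: one bit per scalar handled by the source rgroup.  */
  int src[NELTS] = { 1, 1, 1, 1, 1, 1, 0, 0 };
  int dest0[NELTS], dest1[NELTS];

  /* dest0 = interleave-lo (src, src), dest1 = interleave-hi (src, src).  */
  for (int i = 0; i < NELTS / 2; i++)
    {
      dest0[2 * i] = src[i];
      dest0[2 * i + 1] = src[i];
      dest1[2 * i] = src[NELTS / 2 + i];
      dest1[2 * i + 1] = src[NELTS / 2 + i];
    }

  for (int i = 0; i < NELTS; i++)
    printf ("%d ", dest0[i]);
  printf ("\n");                       /* 1 1 1 1 1 1 1 1 */
  for (int i = 0; i < NELTS; i++)
    printf ("%d ", dest1[i]);
  printf ("\n");                       /* 1 1 1 1 0 0 0 0 */
  return 0;
}
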
378
379 /* Helper for vect_set_loop_condition_masked. Generate definitions for
380 all the masks in RGM and return a mask that is nonzero when the loop
381 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
382 Use LOOP_COND_GSI to insert code before the exit gcond.
383
384 RGM belongs to loop LOOP. The loop originally iterated NITERS
385 times and has been vectorized according to LOOP_VINFO. Each iteration
386 of the vectorized loop handles VF iterations of the scalar loop.
387
388 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
389 starts with NITERS_SKIP dummy iterations of the scalar loop before
390 the real work starts. The mask elements for these dummy iterations
391 must be 0, to ensure that the extra iterations do not have an effect.
392
393 It is known that:
394
395 NITERS * RGM->max_nscalars_per_iter
396
397 does not overflow. However, MIGHT_WRAP_P says whether an induction
398 variable that starts at 0 and has step:
399
400 VF * RGM->max_nscalars_per_iter
401
402 might overflow before hitting a value above:
403
404 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
405
406 This means that we cannot guarantee that such an induction variable
407 would ever hit a value that produces a set of all-false masks for RGM. */
408
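
[Editorial illustration] Before the function itself, here is a minimal standalone C model (not GCC internals) of how one mask of the rgroup described above is formed: a WHILE_ULT-style mask sets lane J when INDEX + J < LIMIT, and a mask whose scalars overlap the NITERS_SKIP dummy iterations is additionally ANDed with the complement of a mask covering those skipped scalars. All of VF, NITERS and NITERS_SKIP below are made-up numbers.

#include <stdio.h>

enum { VF = 8 };

/* Model of WHILE_ULT: lane J is active when INDEX + J < LIMIT.  */
static void
while_ult (int mask[VF], unsigned int index, unsigned int limit)
{
  for (int j = 0; j < VF; j++)
    mask[j] = (index + j < limit);
}

int main (void)
{
  unsigned int niters = 13, niters_skip = 3;
  int init_mask[VF], skip_mask[VF];

  /* Scalars within the loop limit for the first vector iteration...  */
  while_ult (init_mask, 0, niters + niters_skip);
  /* ...minus the NITERS_SKIP dummy iterations at the start.  */
  while_ult (skip_mask, 0, niters_skip);
  for (int j = 0; j < VF; j++)
    init_mask[j] &= !skip_mask[j];

  for (int j = 0; j < VF; j++)
    printf ("%d ", init_mask[j]);
  printf ("\n");   /* 0 0 0 1 1 1 1 1: lanes 0-2 are the skipped scalars */
  return 0;
}
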
409 static tree
410 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
411 gimple_seq *preheader_seq,
412 gimple_stmt_iterator loop_cond_gsi,
413 rgroup_masks *rgm, tree vf,
414 tree niters, tree niters_skip,
415 bool might_wrap_p)
416 {
417 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
418 tree mask_type = rgm->mask_type;
419 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
420 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
421
422 /* Calculate the maximum number of scalar values that the rgroup
423 handles in total, the number that it handles for each iteration
424 of the vector loop, and the number that it should skip during the
425 first iteration of the vector loop. */
426 tree nscalars_total = niters;
427 tree nscalars_step = vf;
428 tree nscalars_skip = niters_skip;
429 if (nscalars_per_iter != 1)
430 {
431 /* We checked before choosing to use a fully-masked loop that these
432 multiplications don't overflow. */
433 tree factor = build_int_cst (compare_type, nscalars_per_iter);
434 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
435 nscalars_total, factor);
436 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
437 nscalars_step, factor);
438 if (nscalars_skip)
439 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
440 nscalars_skip, factor);
441 }
442
443 /* Create an induction variable that counts the number of scalars
444 processed. */
445 tree index_before_incr, index_after_incr;
446 gimple_stmt_iterator incr_gsi;
447 bool insert_after;
448 tree zero_index = build_int_cst (compare_type, 0);
449 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
450 create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
451 insert_after, &index_before_incr, &index_after_incr);
452
453 tree test_index, test_limit, first_limit;
454 gimple_stmt_iterator *test_gsi;
455 if (might_wrap_p)
456 {
457 /* In principle the loop should stop iterating once the incremented
458 IV reaches a value greater than or equal to:
459
460 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
461
462 However, there's no guarantee that this addition doesn't overflow
463 the comparison type, or that the IV hits a value above it before
464 wrapping around. We therefore adjust the limit down by one
465 IV step:
466
467 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
468 -[infinite-prec] NSCALARS_STEP
469
470 and compare the IV against this limit _before_ incrementing it.
471 Since the comparison type is unsigned, we actually want the
472 subtraction to saturate at zero:
473
474 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
475 -[sat] NSCALARS_STEP
476
477 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
478
479 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
480
481 where the rightmost subtraction can be done directly in
482 COMPARE_TYPE. */
483 test_index = index_before_incr;
484 tree adjust = nscalars_step;
485 if (nscalars_skip)
486 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
487 adjust, nscalars_skip);
488 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
489 nscalars_total, adjust);
490 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
491 test_limit, adjust);
492 test_gsi = &incr_gsi;
493
494 /* Get a safe limit for the first iteration. */
495 if (nscalars_skip)
496 {
497 /* The first vector iteration can handle at most NSCALARS_STEP
498 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
499 NSCALARS_SKIP to that cannot overflow. */
500 tree const_limit = build_int_cst (compare_type,
501 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
502 * nscalars_per_iter);
503 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
504 nscalars_total, const_limit);
505 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
506 first_limit, nscalars_skip);
507 }
508 else
509 /* For the first iteration it doesn't matter whether the IV hits
510 a value above NSCALARS_TOTAL. That only matters for the latch
511 condition. */
512 first_limit = nscalars_total;
513 }
514 else
515 {
516 /* Test the incremented IV, which will always hit a value above
517 the bound before wrapping. */
518 test_index = index_after_incr;
519 test_limit = nscalars_total;
520 if (nscalars_skip)
521 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
522 test_limit, nscalars_skip);
523 test_gsi = &loop_cond_gsi;
524
525 first_limit = test_limit;
526 }
527
528 /* Provide a definition of each mask in the group. */
529 tree next_mask = NULL_TREE;
530 tree mask;
531 unsigned int i;
532 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
533 {
534 /* Previous masks will cover BIAS scalars. This mask covers the
535 next batch. */
536 poly_uint64 bias = nscalars_per_mask * i;
537 tree bias_tree = build_int_cst (compare_type, bias);
538 gimple *tmp_stmt;
539
540 /* See whether the first iteration of the vector loop is known
541 to have a full mask. */
542 poly_uint64 const_limit;
543 bool first_iteration_full
544 = (poly_int_tree_p (first_limit, &const_limit)
545 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
546
547 /* Rather than have a new IV that starts at BIAS and goes up to
548 TEST_LIMIT, prefer to use the same 0-based IV for each mask
549 and adjust the bound down by BIAS. */
550 tree this_test_limit = test_limit;
551 if (i != 0)
552 {
553 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
554 compare_type, this_test_limit,
555 bias_tree);
556 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
557 compare_type, this_test_limit,
558 bias_tree);
559 }
560
561 /* Create the initial mask. First include all scalars that
562 are within the loop limit. */
563 tree init_mask = NULL_TREE;
564 if (!first_iteration_full)
565 {
566 tree start, end;
567 if (first_limit == test_limit)
568 {
569 /* Use a natural test between zero (the initial IV value)
570 and the loop limit. The "else" block would be valid too,
571 but this choice can avoid the need to load BIAS_TREE into
572 a register. */
573 start = zero_index;
574 end = this_test_limit;
575 }
576 else
577 {
578 /* FIRST_LIMIT is the maximum number of scalars handled by the
579 first iteration of the vector loop. Test the portion
580 associated with this mask. */
581 start = bias_tree;
582 end = first_limit;
583 }
584
585 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
586 tmp_stmt = vect_gen_while (init_mask, start, end);
587 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
588 }
589
590 /* Now AND out the bits that are within the number of skipped
591 scalars. */
592 poly_uint64 const_skip;
593 if (nscalars_skip
594 && !(poly_int_tree_p (nscalars_skip, &const_skip)
595 && known_le (const_skip, bias)))
596 {
597 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
598 bias_tree, nscalars_skip);
599 if (init_mask)
600 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
601 init_mask, unskipped_mask);
602 else
603 init_mask = unskipped_mask;
604 }
605
606 if (!init_mask)
607 /* First iteration is full. */
608 init_mask = build_minus_one_cst (mask_type);
609
610 /* Get the mask value for the next iteration of the loop. */
611 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
612 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
613 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
614
615 vect_set_loop_mask (loop, mask, init_mask, next_mask);
616 }
617 return next_mask;
618 }
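
[Editorial illustration] The MIGHT_WRAP_P branch above is easiest to see with numbers. A minimal standalone sketch, assuming an 8-bit compare type and made-up counts: NSCALARS_TOTAL + NSCALARS_SKIP would be 256 and wraps to 0, but testing the IV before the increment against NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP), with the saturating subtraction open-coded as a MAX followed by a subtraction, still stops after the right number of vector iterations and never lets the IV wrap.

#include <stdio.h>

int main (void)
{
  typedef unsigned char cmp_t;   /* stands in for the narrow compare type */
  cmp_t nscalars_total = 250, nscalars_step = 16, nscalars_skip = 6;

  cmp_t adjust = nscalars_step - nscalars_skip;                 /* 10 */
  cmp_t max = nscalars_total > adjust ? nscalars_total : adjust;
  cmp_t test_limit = max - adjust;                              /* 240 */

  unsigned int iterations = 0;
  cmp_t iv = 0;
  for (;;)
    {
      iterations++;
      if (iv >= test_limit)   /* compare *before* the increment */
        break;
      iv += nscalars_step;
    }
  /* (250 + 6) scalars at 16 per vector iteration -> 16 iterations,
     and IV stops at 240 instead of wrapping past 255.  */
  printf ("%u\n", iterations);
  return 0;
}
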
619
620 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
621 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
622 number of iterations of the original scalar loop that should be
623 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
624 as for vect_set_loop_condition.
625
626 Insert the branch-back condition before LOOP_COND_GSI and return the
627 final gcond. */
628
629 static gcond *
630 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
631 tree niters, tree final_iv,
632 bool niters_maybe_zero,
633 gimple_stmt_iterator loop_cond_gsi)
634 {
635 gimple_seq preheader_seq = NULL;
636 gimple_seq header_seq = NULL;
637
638 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
639 unsigned int compare_precision = TYPE_PRECISION (compare_type);
640 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
641 tree orig_niters = niters;
642
643 /* Type of the initial value of NITERS. */
644 tree ni_actual_type = TREE_TYPE (niters);
645 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
646
647 /* Convert NITERS to the same size as the compare. */
648 if (compare_precision > ni_actual_precision
649 && niters_maybe_zero)
650 {
651 /* We know that there is always at least one iteration, so if the
652 count is zero then it must have wrapped. Cope with this by
653 subtracting 1 before the conversion and adding 1 to the result. */
654 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
655 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
656 niters, build_minus_one_cst (ni_actual_type));
657 niters = gimple_convert (&preheader_seq, compare_type, niters);
658 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
659 niters, build_one_cst (compare_type));
660 }
661 else
662 niters = gimple_convert (&preheader_seq, compare_type, niters);
663
664 /* Convert skip_niters to the right type. */
665 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
666
667 /* Now calculate the value that the induction variable must be able
668 to hit in order to ensure that we end the loop with an all-false mask.
669 This involves adding the maximum number of inactive trailing scalar
670 iterations. */
671 widest_int iv_limit;
672 bool known_max_iters = max_loop_iterations (loop, &iv_limit);
673 if (known_max_iters)
674 {
675 if (niters_skip)
676 {
677 /* Add the maximum number of skipped iterations to the
678 maximum iteration count. */
679 if (TREE_CODE (niters_skip) == INTEGER_CST)
680 iv_limit += wi::to_widest (niters_skip);
681 else
682 iv_limit += max_vf - 1;
683 }
684 /* IV_LIMIT is the maximum number of latch iterations, which is also
685 the maximum in-range IV value. Round this value down to the previous
686 vector alignment boundary and then add an extra full iteration. */
687 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
688 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
689 }
690
691 /* Get the vectorization factor in tree form. */
692 tree vf = build_int_cst (compare_type,
693 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
694
695 /* Iterate over all the rgroups and fill in their masks. We could use
696 the first mask from any rgroup for the loop condition; here we
697 arbitrarily pick the last. */
698 tree test_mask = NULL_TREE;
699 rgroup_masks *rgm;
700 unsigned int i;
701 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
702 FOR_EACH_VEC_ELT (*masks, i, rgm)
703 if (!rgm->masks.is_empty ())
704 {
705 /* First try using permutes. This adds a single vector
706 instruction to the loop for each mask, but needs no extra
707 loop invariants or IVs. */
708 unsigned int nmasks = i + 1;
709 if ((nmasks & 1) == 0)
710 {
711 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
712 if (!half_rgm->masks.is_empty ()
713 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
714 continue;
715 }
716
717 /* See whether zero-based IV would ever generate all-false masks
718 before wrapping around. */
719 bool might_wrap_p
720 = (!known_max_iters
721 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
722 UNSIGNED)
723 > compare_precision));
724
725 /* Set up all masks for this group. */
726 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
727 &preheader_seq,
728 loop_cond_gsi, rgm, vf,
729 niters, niters_skip,
730 might_wrap_p);
731 }
732
733 /* Emit all accumulated statements. */
734 add_preheader_seq (loop, preheader_seq);
735 add_header_seq (loop, header_seq);
736
737 /* Get a boolean result that tells us whether to iterate. */
738 edge exit_edge = single_exit (loop);
739 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
740 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
741 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
742 NULL_TREE, NULL_TREE);
743 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
744
745 /* The loop iterates (NITERS - 1) / VF + 1 times.
746 Subtract one from this to get the latch count. */
747 tree step = build_int_cst (compare_type,
748 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
749 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
750 build_minus_one_cst (compare_type));
751 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
752 niters_minus_one, step);
753
754 if (final_iv)
755 {
756 gassign *assign = gimple_build_assign (final_iv, orig_niters);
757 gsi_insert_on_edge_immediate (single_exit (loop), assign);
758 }
759
760 return cond_stmt;
761 }
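
[Editorial illustration] The NITERS_MAYBE_ZERO conversion in the function above relies on the count being nonzero in the wider compare type even when it has wrapped to zero in the narrower original type. A minimal standalone sketch, assuming an 8-bit original niters type and a 32-bit compare type:

#include <stdio.h>

int main (void)
{
  unsigned char ni_actual = 0;   /* 256 iterations, wrapped to 0 */

  /* Subtract 1 in the narrow type, widen, then add 1 back.  */
  unsigned char minus_one = ni_actual - 1;            /* 255 */
  unsigned int niters = (unsigned int) minus_one + 1;

  printf ("%u\n", niters);       /* prints 256 */
  return 0;
}
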
762
763 /* Like vect_set_loop_condition, but handle the case in which there
764 are no loop masks. */
765
766 static gcond *
767 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
768 tree step, tree final_iv,
769 bool niters_maybe_zero,
770 gimple_stmt_iterator loop_cond_gsi)
257 { 771 {
258 tree indx_before_incr, indx_after_incr; 772 tree indx_before_incr, indx_after_incr;
259 gcond *cond_stmt; 773 gcond *cond_stmt;
260 gcond *orig_cond; 774 gcond *orig_cond;
775 edge pe = loop_preheader_edge (loop);
261 edge exit_edge = single_exit (loop); 776 edge exit_edge = single_exit (loop);
262 gimple_stmt_iterator loop_cond_gsi;
263 gimple_stmt_iterator incr_gsi; 777 gimple_stmt_iterator incr_gsi;
264 bool insert_after; 778 bool insert_after;
265 tree init = build_int_cst (TREE_TYPE (niters), 0);
266 tree step = build_int_cst (TREE_TYPE (niters), 1);
267 source_location loop_loc;
268 enum tree_code code; 779 enum tree_code code;
780 tree niters_type = TREE_TYPE (niters);
269 781
270 orig_cond = get_loop_exit_condition (loop); 782 orig_cond = get_loop_exit_condition (loop);
271 gcc_assert (orig_cond); 783 gcc_assert (orig_cond);
272 loop_cond_gsi = gsi_for_stmt (orig_cond); 784 loop_cond_gsi = gsi_for_stmt (orig_cond);
273 785
786 tree init, limit;
787 if (!niters_maybe_zero && integer_onep (step))
788 {
789 /* In this case we can use a simple 0-based IV:
790
791 A:
792 x = 0;
793 do
794 {
795 ...
796 x += 1;
797 }
798 while (x < NITERS); */
799 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
800 init = build_zero_cst (niters_type);
801 limit = niters;
802 }
803 else
804 {
805 /* The following works for all values of NITERS except 0:
806
807 B:
808 x = 0;
809 do
810 {
811 ...
812 x += STEP;
813 }
814 while (x <= NITERS - STEP);
815
816 so that the loop continues to iterate if x + STEP - 1 < NITERS
817 but stops if x + STEP - 1 >= NITERS.
818
819 However, if NITERS is zero, x never hits a value above NITERS - STEP
820 before wrapping around. There are two obvious ways of dealing with
821 this:
822
823 - start at STEP - 1 and compare x before incrementing it
824 - start at -1 and compare x after incrementing it
825
826 The latter is simpler and is what we use. The loop in this case
827 looks like:
828
829 C:
830 x = -1;
831 do
832 {
833 ...
834 x += STEP;
835 }
836 while (x < NITERS - STEP);
837
838 In both cases the loop limit is NITERS - STEP. */
839 gimple_seq seq = NULL;
840 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
841 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
842 if (seq)
843 {
844 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
845 gcc_assert (!new_bb);
846 }
847 if (niters_maybe_zero)
848 {
849 /* Case C. */
850 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
851 init = build_all_ones_cst (niters_type);
852 }
853 else
854 {
855 /* Case B. */
856 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
857 init = build_zero_cst (niters_type);
858 }
859 }
860
274 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 861 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
275 create_iv (init, step, NULL_TREE, loop, 862 create_iv (init, step, NULL_TREE, loop,
276 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr); 863 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
277
278 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr, 864 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
279 true, NULL_TREE, true, 865 true, NULL_TREE, true,
280 GSI_SAME_STMT); 866 GSI_SAME_STMT);
281 niters = force_gimple_operand_gsi (&loop_cond_gsi, niters, true, NULL_TREE, 867 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
282 true, GSI_SAME_STMT); 868 true, GSI_SAME_STMT);
283 869
284 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR; 870 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
285 cond_stmt = gimple_build_cond (code, indx_after_incr, niters, NULL_TREE,
286 NULL_TREE); 871 NULL_TREE);
287 872
288 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); 873 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
289 874
290 /* Remove old loop exit test: */ 875 /* Record the number of latch iterations. */
291 gsi_remove (&loop_cond_gsi, true); 876 if (limit == niters)
292 free_stmt_vec_info (orig_cond); 877 /* Case A: the loop iterates NITERS times. Subtract one to get the
293 878 latch count. */
294 loop_loc = find_loop_location (loop); 879 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
880 build_int_cst (niters_type, 1));
881 else
882 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
883 Subtract one from this to get the latch count. */
884 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
885 limit, step);
886
887 if (final_iv)
888 {
889 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
890 indx_after_incr, init);
891 gsi_insert_on_edge_immediate (single_exit (loop), assign);
892 }
893
894 return cond_stmt;
895 }
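
[Editorial illustration] A minimal standalone demonstration of "case C" above, assuming an 8-bit induction variable and STEP 4 (both made up): starting at -1 and testing after the increment copes with NITERS == 0, which really means 256 scalar iterations, without a separate guard.

#include <stdio.h>

int main (void)
{
  unsigned char niters = 0;              /* 256 wrapped to 0 */
  unsigned char step = 4;
  unsigned char limit = niters - step;   /* NITERS - STEP = 252 */

  unsigned int count = 0;
  unsigned char x = (unsigned char) -1;  /* 255 */
  do
    {
      count++;
      x += step;
    }
  while (x < limit);
  printf ("%u\n", count);                /* prints 64 == 256 / 4 */
  return 0;
}
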
896
897 /* If we're using fully-masked loops, make LOOP iterate:
898
899 N == (NITERS - 1) / STEP + 1
900
901 times. When NITERS is zero, this is equivalent to making the loop
902 execute (1 << M) / STEP times, where M is the precision of NITERS.
903 NITERS_MAYBE_ZERO is true if this last case might occur.
904
905 If we're not using fully-masked loops, make LOOP iterate:
906
907 N == (NITERS - STEP) / STEP + 1
908
909 times, where NITERS is known to be outside the range [1, STEP - 1].
910 This is equivalent to making the loop execute NITERS / STEP times
911 when NITERS is nonzero and (1 << M) / STEP times otherwise.
912 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
913
914 If FINAL_IV is nonnull, it is an SSA name that should be set to
915 N * STEP on exit from the loop.
916
917 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
918
919 void
920 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
921 tree niters, tree step, tree final_iv,
922 bool niters_maybe_zero)
923 {
924 gcond *cond_stmt;
925 gcond *orig_cond = get_loop_exit_condition (loop);
926 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
927
928 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
929 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
930 final_iv, niters_maybe_zero,
931 loop_cond_gsi);
932 else
933 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
934 final_iv, niters_maybe_zero,
935 loop_cond_gsi);
936
937 /* Remove old loop exit test. */
938 stmt_vec_info orig_cond_info;
939 if (loop_vinfo
940 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
941 loop_vinfo->remove_stmt (orig_cond_info);
942 else
943 gsi_remove (&loop_cond_gsi, true);
944
295 if (dump_enabled_p ()) 945 if (dump_enabled_p ())
296 { 946 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
297 if (LOCATION_LOCUS (loop_loc) != UNKNOWN_LOCATION) 947 cond_stmt);
298 dump_printf (MSG_NOTE, "\nloop at %s:%d: ", LOCATION_FILE (loop_loc),
299 LOCATION_LINE (loop_loc));
300 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
301 }
302
303 /* Record the number of latch iterations. */
304 loop->nb_iterations = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), niters,
305 build_int_cst (TREE_TYPE (niters), 1));
306 } 948 }
307 949
308 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg. 950 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
309 For all PHI arguments in FROM->dest and TO->dest from those 951 For all PHI arguments in FROM->dest and TO->dest from those
310 edges ensure that TO->dest PHI arguments have current_def 952 edges ensure that TO->dest PHI arguments have current_def
637 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0)); 1279 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
638 imm_use_iterator imm_iter; 1280 imm_use_iterator imm_iter;
639 gimple *stmt; 1281 gimple *stmt;
640 use_operand_p use_p; 1282 use_operand_p use_p;
641 1283
1284 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1285 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
642 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION); 1286 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
643 gimple_phi_set_result (new_phi, new_vop); 1287 gimple_phi_set_result (new_phi, new_vop);
644 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop) 1288 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
645 if (stmt != new_phi 1289 if (stmt != new_phi
646 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt))) 1290 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
657 Extract the location of the loop in the source code. 1301 Extract the location of the loop in the source code.
658 If the loop is not well formed for vectorization, an estimated 1302 If the loop is not well formed for vectorization, an estimated
659 location is calculated. 1303 location is calculated.
660 Return the loop location if succeed and NULL if not. */ 1304 Return the loop location if succeed and NULL if not. */
661 1305
662 source_location 1306 dump_user_location_t
663 find_loop_location (struct loop *loop) 1307 find_loop_location (struct loop *loop)
664 { 1308 {
665 gimple *stmt = NULL; 1309 gimple *stmt = NULL;
666 basic_block bb; 1310 basic_block bb;
667 gimple_stmt_iterator si; 1311 gimple_stmt_iterator si;
668 1312
669 if (!loop) 1313 if (!loop)
670 return UNKNOWN_LOCATION; 1314 return dump_user_location_t ();
671 1315
672 stmt = get_loop_exit_condition (loop); 1316 stmt = get_loop_exit_condition (loop);
673 1317
674 if (stmt 1318 if (stmt
675 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION) 1319 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
676 return gimple_location (stmt); 1320 return stmt;
677 1321
678 /* If we got here the loop is probably not "well formed", 1322 /* If we got here the loop is probably not "well formed",
679 try to estimate the loop location */ 1323 try to estimate the loop location */
680 1324
681 if (!loop->header) 1325 if (!loop->header)
682 return UNKNOWN_LOCATION; 1326 return dump_user_location_t ();
683 1327
684 bb = loop->header; 1328 bb = loop->header;
685 1329
686 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 1330 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
687 { 1331 {
688 stmt = gsi_stmt (si); 1332 stmt = gsi_stmt (si);
689 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION) 1333 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
690 return gimple_location (stmt); 1334 return stmt;
691 } 1335 }
692 1336
693 return UNKNOWN_LOCATION; 1337 return dump_user_location_t ();
694 } 1338 }
695 1339
696 /* Return true if PHI defines an IV of the loop to be vectorized. */ 1340 /* Return true if the phi described by STMT_INFO defines an IV of the
1341 loop to be vectorized. */
697 1342
698 static bool 1343 static bool
699 iv_phi_p (gphi *phi) 1344 iv_phi_p (stmt_vec_info stmt_info)
700 { 1345 {
1346 gphi *phi = as_a <gphi *> (stmt_info->stmt);
701 if (virtual_operand_p (PHI_RESULT (phi))) 1347 if (virtual_operand_p (PHI_RESULT (phi)))
702 return false; 1348 return false;
703 1349
704 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
705 gcc_assert (stmt_info != NULL);
706 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 1350 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
707 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) 1351 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
708 return false; 1352 return false;
709 1353
710 return true; 1354 return true;
733 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 1377 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
734 { 1378 {
735 tree evolution_part; 1379 tree evolution_part;
736 1380
737 gphi *phi = gsi.phi (); 1381 gphi *phi = gsi.phi ();
1382 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
738 if (dump_enabled_p ()) 1383 if (dump_enabled_p ())
739 { 1384 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
740 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: "); 1385 phi_info->stmt);
741 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
742 }
743 1386
744 /* Skip virtual phi's. The data dependences that are associated with 1387 /* Skip virtual phi's. The data dependences that are associated with
745 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. 1388 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
746 1389
747 Skip reduction phis. */ 1390 Skip reduction phis. */
748 if (!iv_phi_p (phi)) 1391 if (!iv_phi_p (phi_info))
749 { 1392 {
750 if (dump_enabled_p ()) 1393 if (dump_enabled_p ())
751 dump_printf_loc (MSG_NOTE, vect_location, 1394 dump_printf_loc (MSG_NOTE, vect_location,
752 "reduc or virtual phi. skip.\n"); 1395 "reduc or virtual phi. skip.\n");
753 continue; 1396 continue;
754 } 1397 }
755 1398
756 /* Analyze the evolution function. */ 1399 /* Analyze the evolution function. */
757 1400
758 evolution_part 1401 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
759 = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
760 if (evolution_part == NULL_TREE) 1402 if (evolution_part == NULL_TREE)
761 { 1403 {
762 if (dump_enabled_p ()) 1404 if (dump_enabled_p ())
763 dump_printf (MSG_MISSED_OPTIMIZATION, 1405 dump_printf (MSG_MISSED_OPTIMIZATION,
764 "No access function or evolution.\n"); 1406 "No access function or evolution.\n");
856 tree var, ni, ni_name; 1498 tree var, ni, ni_name;
857 gimple_stmt_iterator last_gsi; 1499 gimple_stmt_iterator last_gsi;
858 1500
859 gphi *phi = gsi.phi (); 1501 gphi *phi = gsi.phi ();
860 gphi *phi1 = gsi1.phi (); 1502 gphi *phi1 = gsi1.phi ();
1503 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
861 if (dump_enabled_p ()) 1504 if (dump_enabled_p ())
862 { 1505 dump_printf_loc (MSG_NOTE, vect_location,
863 dump_printf_loc (MSG_NOTE, vect_location, 1506 "vect_update_ivs_after_vectorizer: phi: %G", phi);
864 "vect_update_ivs_after_vectorizer: phi: ");
865 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
866 }
867 1507
868 /* Skip reduction and virtual phis. */ 1508 /* Skip reduction and virtual phis. */
869 if (!iv_phi_p (phi)) 1509 if (!iv_phi_p (phi_info))
870 { 1510 {
871 if (dump_enabled_p ()) 1511 if (dump_enabled_p ())
872 dump_printf_loc (MSG_NOTE, vect_location, 1512 dump_printf_loc (MSG_NOTE, vect_location,
873 "reduc or virtual phi. skip.\n"); 1513 "reduc or virtual phi. skip.\n");
874 continue; 1514 continue;
875 } 1515 }
876 1516
877 type = TREE_TYPE (gimple_phi_result (phi)); 1517 type = TREE_TYPE (gimple_phi_result (phi));
878 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi)); 1518 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
879 step_expr = unshare_expr (step_expr); 1519 step_expr = unshare_expr (step_expr);
880 1520
881 /* FORNOW: We do not support IVs whose evolution function is a polynomial 1521 /* FORNOW: We do not support IVs whose evolution function is a polynomial
882 of degree >= 2 or exponential. */ 1522 of degree >= 2 or exponential. */
883 gcc_assert (!tree_is_chrec (step_expr)); 1523 gcc_assert (!tree_is_chrec (step_expr));
907 /* Fix phi expressions in the successor bb. */ 1547 /* Fix phi expressions in the successor bb. */
908 adjust_phi_and_debug_stmts (phi1, update_e, ni_name); 1548 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
909 } 1549 }
910 } 1550 }
911 1551
1552 /* Return a gimple value containing the misalignment (measured in vector
1553 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1554 it is away from a perfectly aligned address. Add any new statements
1555 to SEQ. */
1556
1557 static tree
1558 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1559 {
1560 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1561 stmt_vec_info stmt_info = dr_info->stmt;
1562 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1563
1564 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1565 gcc_assert (target_align != 0);
1566
1567 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1568 size_zero_node) < 0;
1569 tree offset = (negative
1570 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1571 : size_zero_node);
1572 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1573 offset);
1574 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1575 tree target_align_minus_1 = build_int_cst (type, target_align - 1);
1576 HOST_WIDE_INT elem_size
1577 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1578 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1579
1580 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1581 tree int_start_addr = fold_convert (type, start_addr);
1582 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1583 target_align_minus_1);
1584
1585 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1586 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1587 elem_size_log);
1588
1589 return misalign_in_elems;
1590 }
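
[Editorial illustration] A minimal standalone version of the arithmetic in get_misalign_in_elems above, with made-up numbers: a 4-byte element whose address is 8 bytes past a 16-byte target alignment boundary is 2 elements away from an aligned address.

#include <stdio.h>
#include <stdint.h>

int main (void)
{
  uintptr_t start_addr = 0x1008;
  unsigned int target_align = 16;   /* stands in for DR_TARGET_ALIGNMENT */
  unsigned int elem_size_log = 2;   /* log2 of the 4-byte element size */

  uintptr_t misalign_in_bytes = start_addr & (target_align - 1);
  uintptr_t misalign_in_elems = misalign_in_bytes >> elem_size_log;

  printf ("%u\n", (unsigned int) misalign_in_elems);   /* prints 2 */
  return 0;
}
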
1591
912 /* Function vect_gen_prolog_loop_niters 1592 /* Function vect_gen_prolog_loop_niters
913 1593
914 Generate the number of iterations which should be peeled as prolog for the 1594 Generate the number of iterations which should be peeled as prolog for the
915 loop represented by LOOP_VINFO. It is calculated as the misalignment of 1595 loop represented by LOOP_VINFO. It is calculated as the misalignment of
916 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). 1596 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
918 refer to an aligned location. The following computation is generated: 1598 refer to an aligned location. The following computation is generated:
919 1599
920 If the misalignment of DR is known at compile time: 1600 If the misalignment of DR is known at compile time:
921 addr_mis = int mis = DR_MISALIGNMENT (dr); 1601 addr_mis = int mis = DR_MISALIGNMENT (dr);
922 Else, compute address misalignment in bytes: 1602 Else, compute address misalignment in bytes:
923 addr_mis = addr & (vectype_align - 1) 1603 addr_mis = addr & (target_align - 1)
924 1604
925 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step 1605 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
926 1606
927 (elem_size = element type size; an element is the scalar element whose type 1607 (elem_size = element type size; an element is the scalar element whose type
928 is the inner type of the vectype) 1608 is the inner type of the vectype)
942 1622
943 static tree 1623 static tree
944 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo, 1624 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
945 basic_block bb, int *bound) 1625 basic_block bb, int *bound)
946 { 1626 {
947 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 1627 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
948 tree var; 1628 tree var;
949 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); 1629 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
950 gimple_seq stmts = NULL, new_stmts = NULL; 1630 gimple_seq stmts = NULL, new_stmts = NULL;
951 tree iters, iters_name; 1631 tree iters, iters_name;
952 gimple *dr_stmt = DR_STMT (dr); 1632 stmt_vec_info stmt_info = dr_info->stmt;
953 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
954 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 1633 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
955 unsigned int target_align = DR_TARGET_ALIGNMENT (dr); 1634 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
956 1635
957 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) 1636 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
958 { 1637 {
959 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 1638 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
960 1639
965 iters = build_int_cst (niters_type, npeel); 1644 iters = build_int_cst (niters_type, npeel);
966 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 1645 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
967 } 1646 }
968 else 1647 else
969 { 1648 {
970 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; 1649 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
971 tree offset = negative 1650 tree type = TREE_TYPE (misalign_in_elems);
972 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
973 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
974 &stmts, offset);
975 tree type = unsigned_type_for (TREE_TYPE (start_addr));
976 tree target_align_minus_1 = build_int_cst (type, target_align - 1);
977 HOST_WIDE_INT elem_size 1651 HOST_WIDE_INT elem_size
978 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); 1652 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
979 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
980 HOST_WIDE_INT align_in_elems = target_align / elem_size; 1653 HOST_WIDE_INT align_in_elems = target_align / elem_size;
981 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1); 1654 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
982 tree align_in_elems_tree = build_int_cst (type, align_in_elems); 1655 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
983 tree misalign_in_bytes;
984 tree misalign_in_elems;
985
986 /* Create: misalign_in_bytes = addr & (target_align - 1). */
987 misalign_in_bytes
988 = fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
989 target_align_minus_1);
990
991 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
992 misalign_in_elems
993 = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, elem_size_log);
994 1656
995 /* Create: (niters_type) ((align_in_elems - misalign_in_elems) 1657 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
996 & (align_in_elems - 1)). */ 1658 & (align_in_elems - 1)). */
1659 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1660 size_zero_node) < 0;
997 if (negative) 1661 if (negative)
998 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems, 1662 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
999 align_in_elems_tree); 1663 align_in_elems_tree);
1000 else 1664 else
1001 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree, 1665 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1004 iters = fold_convert (niters_type, iters); 1668 iters = fold_convert (niters_type, iters);
1005 *bound = align_in_elems - 1; 1669 *bound = align_in_elems - 1;
1006 } 1670 }
1007 1671
1008 if (dump_enabled_p ()) 1672 if (dump_enabled_p ())
1009 { 1673 dump_printf_loc (MSG_NOTE, vect_location,
1010 dump_printf_loc (MSG_NOTE, vect_location, 1674 "niters for prolog loop: %T\n", iters);
1011 "niters for prolog loop: ");
1012 dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
1013 dump_printf (MSG_NOTE, "\n");
1014 }
1015 1675
1016 var = create_tmp_var (niters_type, "prolog_loop_niters"); 1676 var = create_tmp_var (niters_type, "prolog_loop_niters");
1017 iters_name = force_gimple_operand (iters, &new_stmts, false, var); 1677 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1018 1678
1019 if (new_stmts) 1679 if (new_stmts)
1031 } 1691 }
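
[Editorial illustration] A minimal standalone version of the runtime prolog-iteration formula used above, with made-up values: with a target alignment of 4 elements, a reference that is misaligned by 2 elements needs 2 peeled prolog iterations, and an already-aligned one needs 0.

#include <stdio.h>

static unsigned int
prolog_niters (unsigned int align_in_elems, unsigned int misalign_in_elems)
{
  /* (align_in_elems - misalign_in_elems) & (align_in_elems - 1) */
  return (align_in_elems - misalign_in_elems) & (align_in_elems - 1);
}

int main (void)
{
  printf ("%u\n", prolog_niters (4, 2));   /* prints 2 */
  printf ("%u\n", prolog_niters (4, 0));   /* prints 0 */
  return 0;
}
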
1032 1692
1033 1693
1034 /* Function vect_update_init_of_dr 1694 /* Function vect_update_init_of_dr
1035 1695
1036 NITERS iterations were peeled from LOOP. DR represents a data reference 1696 If CODE is PLUS, the vector loop starts NITERS iterations after the
1037 in LOOP. This function updates the information recorded in DR to 1697 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1038 account for the fact that the first NITERS iterations had already been 1698 iterations before the scalar one (using masking to skip inactive
1039 executed. Specifically, it updates the OFFSET field of DR. */ 1699 elements). This function updates the information recorded in DR to
1700 account for the difference. Specifically, it updates the OFFSET
1701 field of DR. */
1040 1702
1041 static void 1703 static void
1042 vect_update_init_of_dr (struct data_reference *dr, tree niters) 1704 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1043 { 1705 {
1044 tree offset = DR_OFFSET (dr); 1706 tree offset = DR_OFFSET (dr);
1045 1707
1046 niters = fold_build2 (MULT_EXPR, sizetype, 1708 niters = fold_build2 (MULT_EXPR, sizetype,
1047 fold_convert (sizetype, niters), 1709 fold_convert (sizetype, niters),
1048 fold_convert (sizetype, DR_STEP (dr))); 1710 fold_convert (sizetype, DR_STEP (dr)));
1049 offset = fold_build2 (PLUS_EXPR, sizetype, 1711 offset = fold_build2 (code, sizetype,
1050 fold_convert (sizetype, offset), niters); 1712 fold_convert (sizetype, offset), niters);
1051 DR_OFFSET (dr) = offset; 1713 DR_OFFSET (dr) = offset;
1052 } 1714 }
1053 1715
1054 1716
1055 /* Function vect_update_inits_of_drs 1717 /* Function vect_update_inits_of_drs
1056 1718
1057 NITERS iterations were peeled from the loop represented by LOOP_VINFO. 1719 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1058 This function updates the information recorded for the data references in 1720 CODE and NITERS are as for vect_update_inits_of_dr. */
1059 the loop to account for the fact that the first NITERS iterations had
1060 already been executed. Specifically, it updates the initial_condition of
1061 the access_function of all the data_references in the loop. */
1062 1721
1063 static void 1722 static void
1064 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters) 1723 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1724 tree_code code)
1065 { 1725 {
1066 unsigned int i; 1726 unsigned int i;
1067 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1727 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1068 struct data_reference *dr; 1728 struct data_reference *dr;
1069 1729
1070 if (dump_enabled_p ()) 1730 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1071 dump_printf_loc (MSG_NOTE, vect_location,
1072 "=== vect_update_inits_of_dr ===\n");
1073 1731
1074 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */ 1732 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
1075 if (!types_compatible_p (sizetype, TREE_TYPE (niters))) 1733 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1076 { 1734 {
1077 gimple_seq seq; 1735 gimple_seq seq;
1086 gcc_assert (!new_bb); 1744 gcc_assert (!new_bb);
1087 } 1745 }
1088 } 1746 }
1089 1747
1090 FOR_EACH_VEC_ELT (datarefs, i, dr) 1748 FOR_EACH_VEC_ELT (datarefs, i, dr)
1091 vect_update_init_of_dr (dr, niters); 1749 {
1092 } 1750 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1093 1751 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1752 vect_update_init_of_dr (dr, niters, code);
1753 }
1754 }
1755
1756 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1757 by masking. This involves calculating the number of iterations to
1758 be peeled and then aligning all memory references appropriately. */
1759
1760 void
1761 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1762 {
1763 tree misalign_in_elems;
1764 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1765
1766 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1767
1768 /* From the information recorded in LOOP_VINFO get the number of iterations
1769 that need to be skipped via masking. */
1770 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1771 {
1772 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1773 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1774 misalign_in_elems = build_int_cst (type, misalign);
1775 }
1776 else
1777 {
1778 gimple_seq seq1 = NULL, seq2 = NULL;
1779 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1780 misalign_in_elems = fold_convert (type, misalign_in_elems);
1781 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1782 &seq2, true, NULL_TREE);
1783 gimple_seq_add_seq (&seq1, seq2);
1784 if (seq1)
1785 {
1786 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1787 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1788 gcc_assert (!new_bb);
1789 }
1790 }
1791
1792 if (dump_enabled_p ())
1793 dump_printf_loc (MSG_NOTE, vect_location,
1794 "misalignment for fully-masked loop: %T\n",
1795 misalign_in_elems);
1796
1797 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1798
1799 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1800 }
1094 1801
1095 /* This function builds ni_name = number of iterations. Statements 1802 /* This function builds ni_name = number of iterations. Statements
1096 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set 1803 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1097 it to TRUE if new ssa_var is generated. */ 1804 it to TRUE if new ssa_var is generated. */
1098 1805
1122 } 1829 }
1123 1830
1124 /* Calculate the number of iterations above which vectorized loop will be 1831 /* Calculate the number of iterations above which vectorized loop will be
1125 preferred than scalar loop. NITERS_PROLOG is the number of iterations 1832 preferred than scalar loop. NITERS_PROLOG is the number of iterations
1126 of prolog loop. If it's integer const, the integer number is also passed 1833 of prolog loop. If it's integer const, the integer number is also passed
1127 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (included) of 1834 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1128 number of iterations of prolog loop. VFM1 is vector factor minus one. 1835 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1129 If CHECK_PROFITABILITY is true, TH is the threshold below which scalar 1836 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1130 (rather than vectorized) loop will be executed. This function stores 1837 threshold below which the scalar (rather than vectorized) loop will be
1131 upper bound (included) of the result in BOUND_SCALAR. */ 1838 executed. This function stores the upper bound (inclusive) of the result
1839 in BOUND_SCALAR. */
1132 1840
1133 static tree 1841 static tree
1134 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog, 1842 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1135 int bound_prolog, int vfm1, int th, 1843 int bound_prolog, poly_int64 bound_epilog, int th,
1136 int *bound_scalar, bool check_profitability) 1844 poly_uint64 *bound_scalar,
1845 bool check_profitability)
1137 { 1846 {
1138 tree type = TREE_TYPE (niters_prolog); 1847 tree type = TREE_TYPE (niters_prolog);
1139 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog, 1848 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1140 build_int_cst (type, vfm1)); 1849 build_int_cst (type, bound_epilog));
1141 1850
1142 *bound_scalar = vfm1 + bound_prolog; 1851 *bound_scalar = bound_prolog + bound_epilog;
1143 if (check_profitability) 1852 if (check_profitability)
1144 { 1853 {
1145 /* TH indicates the minimum niters of vectorized loop, while we 1854 /* TH indicates the minimum niters of vectorized loop, while we
1146 compute the maximum niters of scalar loop. */ 1855 compute the maximum niters of scalar loop. */
1147 th--; 1856 th--;
1148 /* Peeling for constant times. */ 1857 /* Peeling for constant times. */
1149 if (int_niters_prolog >= 0) 1858 if (int_niters_prolog >= 0)
1150 { 1859 {
1151 *bound_scalar = (int_niters_prolog + vfm1 < th 1860 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1152 ? th
1153 : vfm1 + int_niters_prolog);
1154 return build_int_cst (type, *bound_scalar); 1861 return build_int_cst (type, *bound_scalar);
1155 } 1862 }
1156 /* Peeling for unknown times. Note BOUND_PROLOG is the upper 1863 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1157 bound (inlcuded) of niters of prolog loop. */ 1864 and BOUND_EPILOG are inclusive upper bounds. */
1158 if (th >= vfm1 + bound_prolog) 1865 if (known_ge (th, bound_prolog + bound_epilog))
1159 { 1866 {
1160 *bound_scalar = th; 1867 *bound_scalar = th;
1161 return build_int_cst (type, th); 1868 return build_int_cst (type, th);
1162 } 1869 }
1163 /* Need to do runtime comparison, but BOUND_SCALAR remains the same. */ 1870 /* Need to do runtime comparison. */
1164 else if (th > vfm1) 1871 else if (maybe_gt (th, bound_epilog))
1165 return fold_build2 (MAX_EXPR, type, build_int_cst (type, th), niters); 1872 {
1873 *bound_scalar = upper_bound (*bound_scalar, th);
1874 return fold_build2 (MAX_EXPR, type,
1875 build_int_cst (type, th), niters);
1876 }
1166 } 1877 }
1167 return niters; 1878 return niters;
1168 } 1879 }
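
[Editorial illustration] A minimal standalone sketch of the constant-prolog case of the profitability bound above, with made-up numbers: the scalar path must cover the peeled prolog and epilog iterations and anything below the vector threshold TH, so the bound is the larger of the two.

#include <stdio.h>

static unsigned int
scalar_niters_bound (unsigned int niters_prolog, unsigned int bound_epilog,
                     unsigned int th)
{
  unsigned int bound = niters_prolog + bound_epilog;
  /* TH is the minimum niters of the vector loop, so the scalar loop
     handles at most TH - 1 iterations; take the larger bound.  */
  if (th - 1 > bound)
    bound = th - 1;
  return bound;
}

int main (void)
{
  /* 3 prolog iterations, epilog bound 7, threshold 16 (all made up).  */
  printf ("%u\n", scalar_niters_bound (3, 7, 16));   /* prints 15 */
  return 0;
}
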
1169 1880
1170 /* This function generates the following statements: 1881 /* NITERS is the number of times that the original scalar loop executes
1171 1882 after peeling. Work out the maximum number of iterations N that can
1172 niters = number of iterations loop executes (after peeling) 1883 be handled by the vectorized form of the loop and then either:
1173 niters_vector = niters / vf 1884
1174 1885 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1175 and places them on the loop preheader edge. NITERS_NO_OVERFLOW is 1886
1176 true if NITERS doesn't overflow. */ 1887 niters_vector = N
1888
1889 b) set *STEP_VECTOR_PTR to one and generate:
1890
1891 niters_vector = N / vf
1892
1893 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1894 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1895 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1177 1896
1178 void 1897 void
1179 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters, 1898 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1180 tree *niters_vector_ptr, bool niters_no_overflow) 1899 tree *niters_vector_ptr, tree *step_vector_ptr,
1900 bool niters_no_overflow)
1181 { 1901 {
1182 tree ni_minus_gap, var; 1902 tree ni_minus_gap, var;
1183 tree niters_vector, type = TREE_TYPE (niters); 1903 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1184 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1904 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1185 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); 1905 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1186 tree log_vf = build_int_cst (type, exact_log2 (vf)); 1906 tree log_vf = NULL_TREE;
1187 1907
1188 /* If epilogue loop is required because of data accesses with gaps, we 1908 /* If epilogue loop is required because of data accesses with gaps, we
1189 subtract one iteration from the total number of iterations here for 1909 subtract one iteration from the total number of iterations here for
1190 correct calculation of RATIO. */ 1910 correct calculation of RATIO. */
1191 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 1911 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1202 } 1922 }
1203 } 1923 }
1204 else 1924 else
1205 ni_minus_gap = niters; 1925 ni_minus_gap = niters;
1206 1926
1207 /* Create: niters >> log2(vf) */ 1927 unsigned HOST_WIDE_INT const_vf;
1208 /* If it's known that niters == number of latch executions + 1 doesn't 1928 if (vf.is_constant (&const_vf)
1209 overflow, we can generate niters >> log2(vf); otherwise we generate 1929 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1210 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio 1930 {
1211 will be at least one. */ 1931 /* Create: niters >> log2(vf) */
1212 if (niters_no_overflow) 1932 /* If it's known that niters == number of latch executions + 1 doesn't
1213 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf); 1933 overflow, we can generate niters >> log2(vf); otherwise we generate
1934 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1935 will be at least one. */
1936 log_vf = build_int_cst (type, exact_log2 (const_vf));
1937 if (niters_no_overflow)
1938 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1939 else
1940 niters_vector
1941 = fold_build2 (PLUS_EXPR, type,
1942 fold_build2 (RSHIFT_EXPR, type,
1943 fold_build2 (MINUS_EXPR, type,
1944 ni_minus_gap,
1945 build_int_cst (type, vf)),
1946 log_vf),
1947 build_int_cst (type, 1));
1948 step_vector = build_one_cst (type);
1949 }
1214 else 1950 else
1215 niters_vector 1951 {
1216 = fold_build2 (PLUS_EXPR, type, 1952 niters_vector = ni_minus_gap;
1217 fold_build2 (RSHIFT_EXPR, type, 1953 step_vector = build_int_cst (type, vf);
1218 fold_build2 (MINUS_EXPR, type, ni_minus_gap, 1954 }
1219 build_int_cst (type, vf)),
1220 log_vf),
1221 build_int_cst (type, 1));
1222 1955
1223 if (!is_gimple_val (niters_vector)) 1956 if (!is_gimple_val (niters_vector))
1224 { 1957 {
1225 var = create_tmp_var (type, "bnd"); 1958 var = create_tmp_var (type, "bnd");
1226 gimple_seq stmts = NULL; 1959 gimple_seq stmts = NULL;
1227 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var); 1960 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1228 gsi_insert_seq_on_edge_immediate (pe, stmts); 1961 gsi_insert_seq_on_edge_immediate (pe, stmts);
1229 /* Peeling algorithm guarantees that vector loop bound is at least ONE, 1962 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1230 we set range information to make niters analyzer's life easier. */ 1963 we set range information to make niters analyzer's life easier. */
1231 if (stmts != NULL) 1964 if (stmts != NULL && log_vf)
1232 set_range_info (niters_vector, VR_RANGE, 1965 set_range_info (niters_vector, VR_RANGE,
1233 wi::to_wide (build_int_cst (type, 1)), 1966 wi::to_wide (build_int_cst (type, 1)),
1234 wi::to_wide (fold_build2 (RSHIFT_EXPR, type, 1967 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1235 TYPE_MAX_VALUE (type), 1968 TYPE_MAX_VALUE (type),
1236 log_vf))); 1969 log_vf)));
1237 } 1970 }
1238 *niters_vector_ptr = niters_vector; 1971 *niters_vector_ptr = niters_vector;
1972 *step_vector_ptr = step_vector;
1239 1973
1240 return; 1974 return;
1241 } 1975 }
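Ignoring the one-iteration adjustment for gaps, the two cases described above reduce to the following standalone model; it uses plain integers rather than the gimple the function actually emits, and the helper name is illustrative.

#include <stdbool.h>
#include <stdint.h>

static void
gen_vector_loop_niters_model (uint64_t niters, unsigned int vf,
                              bool fully_masked, bool niters_no_overflow,
                              uint64_t *niters_vector, uint64_t *step_vector)
{
  if (!fully_masked)
    {
      /* Constant power-of-two VF: count full vector iterations.  */
      unsigned int log_vf = __builtin_ctz (vf);
      if (niters_no_overflow)
        *niters_vector = niters >> log_vf;
      else
        /* Assumes at least one full vector iteration, which the peeling
           guards guarantee, so the quotient is at least one.  */
        *niters_vector = ((niters - vf) >> log_vf) + 1;
      *step_vector = 1;
    }
  else
    {
      /* Fully-masked (or variable-VF) loop: count scalar iterations and
         let the loop step by VF.  */
      *niters_vector = niters;
      *step_vector = vf;
    }
}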
1242 1976
1243 /* Given NITERS_VECTOR which is the number of iterations for vectorized 1977 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1248 static void 1982 static void
1249 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo, 1983 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1250 tree niters_vector, 1984 tree niters_vector,
1251 tree *niters_vector_mult_vf_ptr) 1985 tree *niters_vector_mult_vf_ptr)
1252 { 1986 {
1253 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1987 /* We should be using a step_vector of VF if VF is variable. */
1988 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1254 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1989 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1255 tree type = TREE_TYPE (niters_vector); 1990 tree type = TREE_TYPE (niters_vector);
1256 tree log_vf = build_int_cst (type, exact_log2 (vf)); 1991 tree log_vf = build_int_cst (type, exact_log2 (vf));
1257 basic_block exit_bb = single_exit (loop)->dest; 1992 basic_block exit_bb = single_exit (loop)->dest;
1258 1993
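The rest of the function multiplies NITERS_VECTOR back by VF; with the constant power-of-two VF required above this is just a left shift by LOG_VF, as in this illustrative sketch (helper name not GCC's).

#include <stdint.h>

/* niters_vector * vf, generated as a shift by log2 (vf).  */
static uint64_t
mult_vf_model (uint64_t niters_vector, unsigned int vf)
{
  unsigned int log_vf = __builtin_ctz (vf);  /* exact_log2 of a power of two */
  return niters_vector << log_vf;
}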
1343 gphi *update_phi = gsi_update.phi (); 2078 gphi *update_phi = gsi_update.phi ();
1344 2079
1345 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e); 2080 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
1346 /* Generate lcssa PHI node for the first loop. */ 2081 /* Generate lcssa PHI node for the first loop. */
1347 gphi *vect_phi = (loop == first) ? orig_phi : update_phi; 2082 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
1348 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi)) 2083 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2084 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
1349 { 2085 {
1350 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi)); 2086 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
1351 gphi *lcssa_phi = create_phi_node (new_res, between_bb); 2087 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
1352 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION); 2088 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
1353 arg = new_res; 2089 arg = new_res;
1595 - NITERSM1: The number of iterations of the loop's latch. 2331 - NITERSM1: The number of iterations of the loop's latch.
1596 - NITERS_NO_OVERFLOW: No overflow in computing NITERS. 2332 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
1597 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if 2333 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
1598 CHECK_PROFITABILITY is true. 2334 CHECK_PROFITABILITY is true.
1599 Output: 2335 Output:
1600 - NITERS_VECTOR: The number of iterations of loop after vectorization. 2336 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2337 iterate after vectorization; see vect_set_loop_condition for details.
2338 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2339 should be set to the number of scalar iterations handled by the
2340 vector loop. The SSA name is only used on exit from the loop.
1601 2341
1602 This function peels prolog and epilog from the loop, adds guards skipping 2342 This function peels prolog and epilog from the loop, adds guards skipping
1603 PROLOG and EPILOG for various conditions. As a result, the changed CFG 2343 PROLOG and EPILOG for various conditions. As a result, the changed CFG
1604 would look like: 2344 would look like:
1605 2345
1652 versioning conditions if loop versioning is needed. */ 2392 versioning conditions if loop versioning is needed. */
1653 2393
1654 2394
1655 struct loop * 2395 struct loop *
1656 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, 2396 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
1657 tree *niters_vector, int th, bool check_profitability, 2397 tree *niters_vector, tree *step_vector,
1658 bool niters_no_overflow) 2398 tree *niters_vector_mult_vf_var, int th,
2399 bool check_profitability, bool niters_no_overflow)
1659 { 2400 {
1660 edge e, guard_e; 2401 edge e, guard_e;
1661 tree type = TREE_TYPE (niters), guard_cond; 2402 tree type = TREE_TYPE (niters), guard_cond;
1662 basic_block guard_bb, guard_to; 2403 basic_block guard_bb, guard_to;
1663 profile_probability prob_prolog, prob_vector, prob_epilog; 2404 profile_probability prob_prolog, prob_vector, prob_epilog;
1664 int bound_prolog = 0, bound_scalar = 0, bound = 0; 2405 int estimated_vf;
1665 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2406 int prolog_peeling = 0;
1666 int prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 2407 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
1667 bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) 2408 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1668 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); 2409
2410 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2411 poly_uint64 bound_epilog = 0;
2412 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2413 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2414 bound_epilog += vf - 1;
2415 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2416 bound_epilog += 1;
2417 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2418 poly_uint64 bound_scalar = bound_epilog;
1669 2419
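For a concrete feel of the bound just computed, a sketch with plain integers (so it only models fixed-width vectors): with VF = 4, peeling for niters and for gaps, at most 3 + 1 = 4 scalar iterations can land in the epilogue.

#include <stdbool.h>

/* Upper bound on the scalar iterations handled by the epilog loop.  */
static unsigned int
epilogue_bound (unsigned int vf, bool fully_masked,
                bool peeling_for_niter, bool peeling_for_gaps)
{
  unsigned int bound = 0;
  if (!fully_masked && peeling_for_niter)
    bound += vf - 1;   /* leftover iterations after the last full vector */
  if (peeling_for_gaps)
    bound += 1;        /* one extra iteration peeled because of gaps */
  return bound;
}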
1670 if (!prolog_peeling && !epilog_peeling) 2420 if (!prolog_peeling && !epilog_peeling)
1671 return NULL; 2421 return NULL;
1672 2422
1673 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10); 2423 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
1674 if ((vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo)) == 2) 2424 estimated_vf = vect_vf_for_cost (loop_vinfo);
1675 vf = 3; 2425 if (estimated_vf == 2)
2426 estimated_vf = 3;
1676 prob_prolog = prob_epilog = profile_probability::guessed_always () 2427 prob_prolog = prob_epilog = profile_probability::guessed_always ()
1677 .apply_scale (vf - 1, vf); 2428 .apply_scale (estimated_vf - 1, estimated_vf);
1678 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1679 2429
1680 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo); 2430 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
1681 struct loop *first_loop = loop; 2431 struct loop *first_loop = loop;
1682 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP; 2432 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
1683 create_lcssa_for_virtual_phi (loop); 2433 create_lcssa_for_virtual_phi (loop);
1684 update_ssa (TODO_update_ssa_only_virtuals); 2434 update_ssa (TODO_update_ssa_only_virtuals);
1685 2435
1686 if (MAY_HAVE_DEBUG_STMTS) 2436 if (MAY_HAVE_DEBUG_BIND_STMTS)
1687 { 2437 {
1688 gcc_assert (!adjust_vec.exists ()); 2438 gcc_assert (!adjust_vec.exists ());
1689 adjust_vec.create (32); 2439 adjust_vec.create (32);
1690 } 2440 }
1691 initialize_original_copy_tables (); 2441 initialize_original_copy_tables ();
2442
2443 /* Record the anchor bb at which the guard should be placed if the scalar
2444 loop might be preferred. */
2445 basic_block anchor = loop_preheader_edge (loop)->src;
2446
2447 /* Generate the number of iterations for the prolog loop. We do this here
2448 so that we can also get the upper bound on the number of iterations. */
2449 tree niters_prolog;
2450 int bound_prolog = 0;
2451 if (prolog_peeling)
2452 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2453 &bound_prolog);
2454 else
2455 niters_prolog = build_int_cst (type, 0);
1692 2456
1693 /* Prolog loop may be skipped. */ 2457 /* Prolog loop may be skipped. */
1694 bool skip_prolog = (prolog_peeling != 0); 2458 bool skip_prolog = (prolog_peeling != 0);
1695 /* Skip to epilog if scalar loop may be preferred. It's only needed 2459 /* Skip to epilog if scalar loop may be preferred. It's only needed
1696 when we peel for epilog loop and when it hasn't been checked with 2460 when we peel for epilog loop and when it hasn't been checked with
1697 loop versioning. */ 2461 loop versioning. */
1698 bool skip_vector = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2462 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1699 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)); 2463 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2464 bound_prolog + bound_epilog)
2465 : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
1700 /* Epilog loop must be executed if the number of iterations for epilog 2466 /* Epilog loop must be executed if the number of iterations for epilog
1701 loop is known at compile time, otherwise we need to add a check at 2467 loop is known at compile time, otherwise we need to add a check at
1702 the end of vector loop and skip to the end of epilog loop. */ 2468 the end of vector loop and skip to the end of epilog loop. */
1703 bool skip_epilog = (prolog_peeling < 0 2469 bool skip_epilog = (prolog_peeling < 0
1704 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)); 2470 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2471 || !vf.is_constant ());
1705 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */ 2472 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
1706 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2473 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1707 skip_epilog = false; 2474 skip_epilog = false;
1708 2475
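A standalone model of the three decisions above, assuming a constant VF so the !vf.is_constant () term drops out; the names are illustrative, not GCC's.

#include <stdbool.h>
#include <stdint.h>

static void
compute_skips (bool niters_known, uint64_t const_niters,
               unsigned int bound_prolog, unsigned int bound_epilog,
               int prolog_peeling, bool versioned, bool peeling_for_gaps,
               bool *skip_prolog, bool *skip_vector, bool *skip_epilog)
{
  /* A guard around the prolog is needed whenever we peel for alignment.  */
  *skip_prolog = prolog_peeling != 0;
  /* Jump straight to the epilog when the whole loop may fit in the
     peeled iterations, or when versioning has not already ruled that out.  */
  *skip_vector = niters_known
                 ? const_niters < bound_prolog + bound_epilog
                 : !versioned;
  /* A guard after the vector loop is needed unless the epilog is known
     to execute; peeling for gaps forces it to execute.  */
  *skip_epilog = prolog_peeling < 0 || !niters_known;
  if (peeling_for_gaps)
    *skip_epilog = false;
}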
1709 /* Record the anchor bb at which guard should be placed if scalar loop
1710 may be preferred. */
1711 basic_block anchor = loop_preheader_edge (loop)->src;
1712 if (skip_vector) 2476 if (skip_vector)
1713 { 2477 {
1714 split_edge (loop_preheader_edge (loop)); 2478 split_edge (loop_preheader_edge (loop));
1715 2479
1716 /* Due to the order in which we peel prolog and epilog, we first 2480 /* Due to the order in which we peel prolog and epilog, we first
1718 avoid adjusting probabilities of both prolog and vector loops 2482 avoid adjusting probabilities of both prolog and vector loops
1719 separately. Note in this case, the probability of epilog loop 2483 separately. Note in this case, the probability of epilog loop
1720 needs to be scaled back later. */ 2484 needs to be scaled back later. */
1721 basic_block bb_before_loop = loop_preheader_edge (loop)->src; 2485 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
1722 if (prob_vector.initialized_p ()) 2486 if (prob_vector.initialized_p ())
1723 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector); 2487 {
1724 scale_loop_profile (loop, prob_vector, bound); 2488 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
1725 } 2489 scale_loop_profile (loop, prob_vector, 0);
1726 2490 }
1727 tree niters_prolog = build_int_cst (type, 0); 2491 }
1728 source_location loop_loc = find_loop_location (loop); 2492
2493 dump_user_location_t loop_loc = find_loop_location (loop);
1729 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); 2494 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
1730 if (prolog_peeling) 2495 if (prolog_peeling)
1731 { 2496 {
1732 e = loop_preheader_edge (loop); 2497 e = loop_preheader_edge (loop);
1733 if (!slpeel_can_duplicate_loop_p (loop, e)) 2498 if (!slpeel_can_duplicate_loop_p (loop, e))
1746 } 2511 }
1747 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true); 2512 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
1748 first_loop = prolog; 2513 first_loop = prolog;
1749 reset_original_copy_tables (); 2514 reset_original_copy_tables ();
1750 2515
1751 /* Generate and update the number of iterations for prolog loop. */ 2516 /* Update the number of iterations for prolog loop. */
1752 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor, 2517 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
1753 &bound_prolog); 2518 vect_set_loop_condition (prolog, NULL, niters_prolog,
1754 slpeel_make_loop_iterate_ntimes (prolog, niters_prolog); 2519 step_prolog, NULL_TREE, false);
1755 2520
1756 /* Skip the prolog loop. */ 2521 /* Skip the prolog loop. */
1757 if (skip_prolog) 2522 if (skip_prolog)
1758 { 2523 {
1759 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, 2524 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
1771 2536
1772 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog); 2537 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
1773 scale_loop_profile (prolog, prob_prolog, bound_prolog); 2538 scale_loop_profile (prolog, prob_prolog, bound_prolog);
1774 } 2539 }
1775 /* Update init address of DRs. */ 2540 /* Update init address of DRs. */
1776 vect_update_inits_of_drs (loop_vinfo, niters_prolog); 2541 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
1777 /* Update niters for vector loop. */ 2542 /* Update niters for vector loop. */
1778 LOOP_VINFO_NITERS (loop_vinfo) 2543 LOOP_VINFO_NITERS (loop_vinfo)
1779 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog); 2544 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
1780 LOOP_VINFO_NITERSM1 (loop_vinfo) 2545 LOOP_VINFO_NITERSM1 (loop_vinfo)
1781 = fold_build2 (MINUS_EXPR, type, 2546 = fold_build2 (MINUS_EXPR, type,
1821 iterations of loop is unknown at compile time, otherwise this 2586 iterations of loop is unknown at compile time, otherwise this
1822 won't be vectorized. */ 2587 won't be vectorized. */
1823 if (skip_vector) 2588 if (skip_vector)
1824 { 2589 {
1825 /* Additional epilogue iteration is peeled if gap exists. */ 2590 /* Additional epilogue iteration is peeled if gap exists. */
1826 bool peel_for_gaps = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1827 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling, 2591 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
1828 bound_prolog, 2592 bound_prolog, bound_epilog,
1829 peel_for_gaps ? vf : vf - 1,
1830 th, &bound_scalar, 2593 th, &bound_scalar,
1831 check_profitability); 2594 check_profitability);
1832 /* Build guard against NITERSM1 since NITERS may overflow. */ 2595 /* Build guard against NITERSM1 since NITERS may overflow. */
1833 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t); 2596 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
1834 guard_bb = anchor; 2597 guard_bb = anchor;
1841 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); 2604 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
1842 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e); 2605 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
1843 2606
1844 /* Simply propagate profile info from guard_bb to guard_to which is 2607 /* Simply propagate profile info from guard_bb to guard_to which is
1845 a merge point of control flow. */ 2608 a merge point of control flow. */
1846 guard_to->frequency = guard_bb->frequency;
1847 guard_to->count = guard_bb->count; 2609 guard_to->count = guard_bb->count;
2610
1848 /* Scale probability of epilog loop back. 2611 /* Scale probability of epilog loop back.
1849 FIXME: We should avoid scaling down and back up. Profile may 2612 FIXME: We should avoid scaling down and back up. Profile may
1850 get lost if we scale down to 0. */ 2613 get lost if we scale down to 0. */
1851 int scale_up = REG_BR_PROB_BASE * REG_BR_PROB_BASE
1852 / prob_vector.to_reg_br_prob_base ();
1853 basic_block *bbs = get_loop_body (epilog); 2614 basic_block *bbs = get_loop_body (epilog);
1854 scale_bbs_frequencies_int (bbs, epilog->num_nodes, scale_up, 2615 for (unsigned int i = 0; i < epilog->num_nodes; i++)
1855 REG_BR_PROB_BASE); 2616 bbs[i]->count = bbs[i]->count.apply_scale
2617 (bbs[i]->count,
2618 bbs[i]->count.apply_probability
2619 (prob_vector));
1856 free (bbs); 2620 free (bbs);
1857 } 2621 }
1858 2622
1859 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src; 2623 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
1860 tree niters_vector_mult_vf; 2624 tree niters_vector_mult_vf;
1861 /* If loop is peeled for non-zero constant times, now niters refers to 2625 /* If loop is peeled for non-zero constant times, now niters refers to
1862 orig_niters - prolog_peeling, it won't overflow even if the orig_niters 2626 orig_niters - prolog_peeling, it won't overflow even if the orig_niters
1863 overflows. */ 2627 overflows. */
1864 niters_no_overflow |= (prolog_peeling > 0); 2628 niters_no_overflow |= (prolog_peeling > 0);
1865 vect_gen_vector_loop_niters (loop_vinfo, niters, 2629 vect_gen_vector_loop_niters (loop_vinfo, niters,
1866 niters_vector, niters_no_overflow); 2630 niters_vector, step_vector,
1867 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector, 2631 niters_no_overflow);
1868 &niters_vector_mult_vf); 2632 if (!integer_onep (*step_vector))
2633 {
2634 /* On exit from the loop we will have an easy way of calculating
2635 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2636 until then. */
2637 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2638 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2639 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2640 }
2641 else
2642 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2643 &niters_vector_mult_vf);
1869 /* Update IVs of original loop as if they were advanced by 2644 /* Update IVs of original loop as if they were advanced by
1870 niters_vector_mult_vf steps. */ 2645 niters_vector_mult_vf steps. */
1871 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo)); 2646 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
1872 edge update_e = skip_vector ? e : loop_preheader_edge (epilog); 2647 edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
1873 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf, 2648 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
1891 { 2666 {
1892 prob_epilog = prob_vector * prob_epilog + prob_vector.invert (); 2667 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
1893 2668
1894 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog); 2669 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
1895 } 2670 }
1896 scale_loop_profile (epilog, prob_epilog, bound); 2671 scale_loop_profile (epilog, prob_epilog, 0);
1897 } 2672 }
1898 else 2673 else
1899 slpeel_update_phi_nodes_for_lcssa (epilog); 2674 slpeel_update_phi_nodes_for_lcssa (epilog);
1900 2675
1901 bound = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? vf - 1 : vf - 2; 2676 unsigned HOST_WIDE_INT bound;
1902 /* We share epilog loop with scalar version loop. */ 2677 if (bound_scalar.is_constant (&bound))
1903 bound = MAX (bound, bound_scalar - 1); 2678 {
1904 record_niter_bound (epilog, bound, false, true); 2679 gcc_assert (bound != 0);
2680 /* -1 to convert loop iterations to latch iterations. */
2681 record_niter_bound (epilog, bound - 1, false, true);
2682 }
1905 2683
1906 delete_update_ssa (); 2684 delete_update_ssa ();
1907 adjust_vec_debug_stmts (); 2685 adjust_vec_debug_stmts ();
1908 scev_reset (); 2686 scev_reset ();
1909 } 2687 }
1914 } 2692 }
1915 2693
1916 /* Function vect_create_cond_for_niters_checks. 2694 /* Function vect_create_cond_for_niters_checks.
1917 2695
1918 Create a conditional expression that represents the run-time checks for 2696 Create a conditional expression that represents the run-time checks for
1919 loop's niter. The loop is guaranteed to to terminate if the run-time 2697 loop's niter. The loop is guaranteed to terminate if the run-time
1920 checks hold. 2698 checks hold.
1921 2699
1922 Input: 2700 Input:
1923 COND_EXPR - input conditional expression. New conditions will be chained 2701 COND_EXPR - input conditional expression. New conditions will be chained
1924 with logical AND operation. If it is NULL, then the function 2702 with logical AND operation. If it is NULL, then the function
1986 static void 2764 static void
1987 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, 2765 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
1988 tree *cond_expr, 2766 tree *cond_expr,
1989 gimple_seq *cond_expr_stmt_list) 2767 gimple_seq *cond_expr_stmt_list)
1990 { 2768 {
1991 vec<gimple *> may_misalign_stmts 2769 vec<stmt_vec_info> may_misalign_stmts
1992 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 2770 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1993 gimple *ref_stmt; 2771 stmt_vec_info stmt_info;
1994 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo); 2772 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
1995 tree mask_cst; 2773 tree mask_cst;
1996 unsigned int i; 2774 unsigned int i;
1997 tree int_ptrsize_type; 2775 tree int_ptrsize_type;
1998 char tmp_name[20]; 2776 char tmp_name[20];
2009 int_ptrsize_type = signed_type_for (ptr_type_node); 2787 int_ptrsize_type = signed_type_for (ptr_type_node);
2010 2788
2011 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address 2789 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
2012 of the first vector of the i'th data reference. */ 2790 of the first vector of the i'th data reference. */
2013 2791
2014 FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt) 2792 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2015 { 2793 {
2016 gimple_seq new_stmt_list = NULL; 2794 gimple_seq new_stmt_list = NULL;
2017 tree addr_base; 2795 tree addr_base;
2018 tree addr_tmp_name; 2796 tree addr_tmp_name;
2019 tree new_or_tmp_name; 2797 tree new_or_tmp_name;
2020 gimple *addr_stmt, *or_stmt; 2798 gimple *addr_stmt, *or_stmt;
2021 stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt); 2799 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2022 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2023 bool negative = tree_int_cst_compare 2800 bool negative = tree_int_cst_compare
2024 (DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0; 2801 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2025 tree offset = negative 2802 tree offset = negative
2026 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node; 2803 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2027 2804
2028 /* create: addr_tmp = (int)(address_of_first_vector) */ 2805 /* create: addr_tmp = (int)(address_of_first_vector) */
2029 addr_base = 2806 addr_base =
2030 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list, 2807 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2031 offset); 2808 offset);
2032 if (new_stmt_list != NULL) 2809 if (new_stmt_list != NULL)
2033 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list); 2810 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2034 2811
2035 sprintf (tmp_name, "addr2int%d", i); 2812 sprintf (tmp_name, "addr2int%d", i);
2090 addr1, addr2); 2867 addr1, addr2);
2091 chain_cond_expr (cond_expr, part_cond_expr); 2868 chain_cond_expr (cond_expr, part_cond_expr);
2092 } 2869 }
2093 } 2870 }
2094 2871
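The run-time test that vect_create_cond_for_align_checks assembles above, (mask & (dr_1 | ... | dr_n)) == 0, behaves like the following standalone model, assuming MASK is the required vector alignment minus one; the helper name is illustrative.

#include <stdbool.h>
#include <stdint.h>

/* True iff all data-reference base addresses are aligned.  */
static bool
all_refs_aligned (const void *const addrs[], unsigned int n, uintptr_t mask)
{
  uintptr_t or_tmp = 0;
  for (unsigned int i = 0; i < n; i++)
    or_tmp |= (uintptr_t) addrs[i];   /* addr2int conversion and OR chain */
  return (or_tmp & mask) == 0;        /* the final mask test */
}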
2872 /* Create an expression that is true when all lower-bound conditions for
2873 the vectorized loop are met. Chain this condition with *COND_EXPR. */
2874
2875 static void
2876 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2877 {
2878 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2879 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2880 {
2881 tree expr = lower_bounds[i].expr;
2882 tree type = unsigned_type_for (TREE_TYPE (expr));
2883 expr = fold_convert (type, expr);
2884 poly_uint64 bound = lower_bounds[i].min_value;
2885 if (!lower_bounds[i].unsigned_p)
2886 {
2887 expr = fold_build2 (PLUS_EXPR, type, expr,
2888 build_int_cstu (type, bound - 1));
2889 bound += bound - 1;
2890 }
2891 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2892 build_int_cstu (type, bound));
2893 chain_cond_expr (cond_expr, part_cond_expr);
2894 }
2895 }
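The signed case above folds the check |EXPR| >= MIN into one unsigned comparison by biasing both sides by MIN - 1; a minimal model of the condition built for a single vec_lower_bound entry (helper name illustrative):

#include <stdbool.h>
#include <stdint.h>

/* True iff the (possibly negative) distance D has magnitude at least MIN.  */
static bool
lower_bound_ok (int64_t d, uint64_t min, bool unsigned_p)
{
  uint64_t expr = (uint64_t) d;
  uint64_t bound = min;
  if (!unsigned_p)
    {
      expr += min - 1;     /* distances in (-MIN, MIN) now land below BOUND */
      bound += min - 1;    /* compare against 2 * MIN - 1 */
    }
  return expr >= bound;
}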
2896
2095 /* Function vect_create_cond_for_alias_checks. 2897 /* Function vect_create_cond_for_alias_checks.
2096 2898
2097 Create a conditional expression that represents the run-time checks for 2899 Create a conditional expression that represents the run-time checks for
2098 overlapping of address ranges represented by a list of data references 2900 overlapping of address ranges represented by a list of data references
2099 relations passed as input. 2901 relations passed as input.
2149 The versioning precondition(s) are placed in *COND_EXPR and 2951 The versioning precondition(s) are placed in *COND_EXPR and
2150 *COND_EXPR_STMT_LIST. */ 2952 *COND_EXPR_STMT_LIST. */
2151 2953
2152 void 2954 void
2153 vect_loop_versioning (loop_vec_info loop_vinfo, 2955 vect_loop_versioning (loop_vec_info loop_vinfo,
2154 unsigned int th, bool check_profitability) 2956 unsigned int th, bool check_profitability,
2957 poly_uint64 versioning_threshold)
2155 { 2958 {
2156 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop; 2959 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2157 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); 2960 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2158 basic_block condition_bb; 2961 basic_block condition_bb;
2159 gphi_iterator gsi; 2962 gphi_iterator gsi;
2174 2977
2175 if (check_profitability) 2978 if (check_profitability)
2176 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, 2979 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2177 build_int_cst (TREE_TYPE (scalar_loop_iters), 2980 build_int_cst (TREE_TYPE (scalar_loop_iters),
2178 th - 1)); 2981 th - 1));
2982 if (maybe_ne (versioning_threshold, 0U))
2983 {
2984 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2985 build_int_cst (TREE_TYPE (scalar_loop_iters),
2986 versioning_threshold - 1));
2987 if (cond_expr)
2988 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
2989 expr, cond_expr);
2990 else
2991 cond_expr = expr;
2992 }
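Both thresholds are compared against the iteration count biased by one; assuming scalar_loop_iters counts latch iterations (niters - 1), the combined guard reduces to this sketch (helper name illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Combined profitability and versioning-threshold guard.  ITERS_M1 is
   assumed to be the scalar iteration count minus one.  */
static bool
versioning_guard (uint64_t iters_m1, uint64_t th, uint64_t vers_threshold)
{
  bool ok = true;
  if (th > 0)
    ok = ok && iters_m1 >= th - 1;              /* niters >= TH */
  if (vers_threshold > 0)
    ok = ok && iters_m1 >= vers_threshold - 1;  /* niters >= threshold */
  return ok;
}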
2179 2993
2180 if (version_niter) 2994 if (version_niter)
2181 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr); 2995 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
2182 2996
2183 if (cond_expr) 2997 if (cond_expr)
2189 &cond_expr_stmt_list); 3003 &cond_expr_stmt_list);
2190 3004
2191 if (version_alias) 3005 if (version_alias)
2192 { 3006 {
2193 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr); 3007 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3008 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
2194 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr); 3009 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
2195 } 3010 }
2196 3011
2197 cond_expr = force_gimple_operand_1 (cond_expr, &gimplify_stmt_list, 3012 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3013 &gimplify_stmt_list,
2198 is_gimple_condexpr, NULL_TREE); 3014 is_gimple_condexpr, NULL_TREE);
2199 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); 3015 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
2200 3016
2201 initialize_original_copy_tables (); 3017 initialize_original_copy_tables ();
2202 if (scalar_loop) 3018 if (scalar_loop)
2211 scale_loop_frequencies (loop, prob); 3027 scale_loop_frequencies (loop, prob);
2212 /* CONDITION_BB was created above SCALAR_LOOP's preheader, 3028 /* CONDITION_BB was created above SCALAR_LOOP's preheader,
2213 while we need to move it above LOOP's preheader. */ 3029 while we need to move it above LOOP's preheader. */
2214 e = loop_preheader_edge (loop); 3030 e = loop_preheader_edge (loop);
2215 scalar_e = loop_preheader_edge (scalar_loop); 3031 scalar_e = loop_preheader_edge (scalar_loop);
2216 gcc_assert (empty_block_p (e->src) 3032 /* The vector loop preheader might not be empty, since new
2217 && single_pred_p (e->src)); 3033 invariants could have been created while analyzing the loop. */
3034 gcc_assert (single_pred_p (e->src));
2218 gcc_assert (empty_block_p (scalar_e->src) 3035 gcc_assert (empty_block_p (scalar_e->src)
2219 && single_pred_p (scalar_e->src)); 3036 && single_pred_p (scalar_e->src));
2220 gcc_assert (single_pred_p (condition_bb)); 3037 gcc_assert (single_pred_p (condition_bb));
2221 preheader = e->src; 3038 preheader = e->src;
2222 scalar_preheader = scalar_e->src; 3039 scalar_preheader = scalar_e->src;
2245 vect_free_loop_info_assumptions (nloop); 3062 vect_free_loop_info_assumptions (nloop);
2246 /* And set constraint LOOP_C_INFINITE for niter analyzer. */ 3063 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
2247 loop_constraint_set (loop, LOOP_C_INFINITE); 3064 loop_constraint_set (loop, LOOP_C_INFINITE);
2248 } 3065 }
2249 3066
2250 if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION 3067 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
2251 && dump_enabled_p ()) 3068 && dump_enabled_p ())
2252 { 3069 {
2253 if (version_alias) 3070 if (version_alias)
2254 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, 3071 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3072 vect_location,
2255 "loop versioned for vectorization because of " 3073 "loop versioned for vectorization because of "
2256 "possible aliasing\n"); 3074 "possible aliasing\n");
2257 if (version_align) 3075 if (version_align)
2258 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, 3076 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3077 vect_location,
2259 "loop versioned for vectorization to enhance " 3078 "loop versioned for vectorization to enhance "
2260 "alignment\n"); 3079 "alignment\n");
2261 3080
2262 } 3081 }
2263 free_original_copy_tables (); 3082 free_original_copy_tables ();