Mercurial > hg > CbC > CbC_gcc
comparison gcc/tree-vect-loop-manip.c @ 132:d34655255c78
update gcc-8.2
author | mir3636 |
---|---|
date | Thu, 25 Oct 2018 10:21:07 +0900 |
parents | 84e7813d76e9 |
children | 1830386684a0 |
comparison
equal
deleted
inserted
replaced
130:e108057fa461 | 132:d34655255c78 |
---|---|
1 /* Vectorizer Specific Loop Manipulations | 1 /* Vectorizer Specific Loop Manipulations |
2 Copyright (C) 2003-2017 Free Software Foundation, Inc. | 2 Copyright (C) 2003-2018 Free Software Foundation, Inc. |
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> | 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> |
4 and Ira Rosen <irar@il.ibm.com> | 4 and Ira Rosen <irar@il.ibm.com> |
5 | 5 |
6 This file is part of GCC. | 6 This file is part of GCC. |
7 | 7 |
39 #include "tree-ssa.h" | 39 #include "tree-ssa.h" |
40 #include "cfgloop.h" | 40 #include "cfgloop.h" |
41 #include "tree-scalar-evolution.h" | 41 #include "tree-scalar-evolution.h" |
42 #include "tree-vectorizer.h" | 42 #include "tree-vectorizer.h" |
43 #include "tree-ssa-loop-ivopts.h" | 43 #include "tree-ssa-loop-ivopts.h" |
44 #include "gimple-fold.h" | |
45 #include "tree-ssa-loop-niter.h" | |
46 #include "internal-fn.h" | |
47 #include "stor-layout.h" | |
48 #include "optabs-query.h" | |
49 #include "vec-perm-indices.h" | |
44 | 50 |
45 /************************************************************************* | 51 /************************************************************************* |
46 Simple Loop Peeling Utilities | 52 Simple Loop Peeling Utilities |
47 | 53 |
48 Utilities to support loop peeling for vectorization purposes. | 54 Utilities to support loop peeling for vectorization purposes. |
190 /* Adjust debug stmts as scheduled before. */ | 196 /* Adjust debug stmts as scheduled before. */ |
191 | 197 |
192 static void | 198 static void |
193 adjust_vec_debug_stmts (void) | 199 adjust_vec_debug_stmts (void) |
194 { | 200 { |
195 if (!MAY_HAVE_DEBUG_STMTS) | 201 if (!MAY_HAVE_DEBUG_BIND_STMTS) |
196 return; | 202 return; |
197 | 203 |
198 gcc_assert (adjust_vec.exists ()); | 204 gcc_assert (adjust_vec.exists ()); |
199 | 205 |
200 while (!adjust_vec.is_empty ()) | 206 while (!adjust_vec.is_empty ()) |
212 static void | 218 static void |
213 adjust_debug_stmts (tree from, tree to, basic_block bb) | 219 adjust_debug_stmts (tree from, tree to, basic_block bb) |
214 { | 220 { |
215 adjust_info ai; | 221 adjust_info ai; |
216 | 222 |
217 if (MAY_HAVE_DEBUG_STMTS | 223 if (MAY_HAVE_DEBUG_BIND_STMTS |
218 && TREE_CODE (from) == SSA_NAME | 224 && TREE_CODE (from) == SSA_NAME |
219 && ! SSA_NAME_IS_DEFAULT_DEF (from) | 225 && ! SSA_NAME_IS_DEFAULT_DEF (from) |
220 && ! virtual_operand_p (from)) | 226 && ! virtual_operand_p (from)) |
221 { | 227 { |
222 ai.from = from; | 228 ai.from = from; |
240 { | 246 { |
241 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e); | 247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e); |
242 | 248 |
243 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def); | 249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def); |
244 | 250 |
245 if (MAY_HAVE_DEBUG_STMTS) | 251 if (MAY_HAVE_DEBUG_BIND_STMTS) |
246 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi), | 252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi), |
247 gimple_bb (update_phi)); | 253 gimple_bb (update_phi)); |
248 } | 254 } |
249 | 255 |
250 /* Make the LOOP iterate NITERS times. This is done by adding a new IV | 256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that |
251 that starts at zero, increases by one and its limit is NITERS. | 257 the mask should have during the first iteration and NEXT_MASK is the |
252 | 258 value that it should have on subsequent iterations. */ |
253 Assumption: the exit-condition of LOOP is the last stmt in the loop. */ | 259 |
254 | 260 static void |
255 void | 261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask, |
256 slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters) | 262 tree next_mask) |
263 { | |
264 gphi *phi = create_phi_node (mask, loop->header); | |
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION); | |
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION); | |
267 } | |
268 | |
269 /* Add SEQ to the end of LOOP's preheader block. */ | |
270 | |
271 static void | |
272 add_preheader_seq (struct loop *loop, gimple_seq seq) | |
273 { | |
274 if (seq) | |
275 { | |
276 edge pe = loop_preheader_edge (loop); | |
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); | |
278 gcc_assert (!new_bb); | |
279 } | |
280 } | |
281 | |
282 /* Add SEQ to the beginning of LOOP's header block. */ | |
283 | |
284 static void | |
285 add_header_seq (struct loop *loop, gimple_seq seq) | |
286 { | |
287 if (seq) | |
288 { | |
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header); | |
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); | |
291 } | |
292 } | |
293 | |
294 /* Return true if the target can interleave elements of two vectors. | |
295 OFFSET is 0 if the first half of the vectors should be interleaved | |
296 or 1 if the second half should. When returning true, store the | |
297 associated permutation in INDICES. */ | |
298 | |
299 static bool | |
300 interleave_supported_p (vec_perm_indices *indices, tree vectype, | |
301 unsigned int offset) | |
302 { | |
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype); | |
304 poly_uint64 base = exact_div (nelts, 2) * offset; | |
305 vec_perm_builder sel (nelts, 2, 3); | |
306 for (unsigned int i = 0; i < 3; ++i) | |
307 { | |
308 sel.quick_push (base + i); | |
309 sel.quick_push (base + i + nelts); | |
310 } | |
311 indices->new_vector (sel, 2, nelts); | |
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices); | |
313 } | |
314 | |
315 /* Try to use permutes to define the masks in DEST_RGM using the masks | |
316 in SRC_RGM, given that the former has twice as many masks as the | |
317 latter. Return true on success, adding any new statements to SEQ. */ | |
318 | |
319 static bool | |
320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm, | |
321 rgroup_masks *src_rgm) | |
322 { | |
323 tree src_masktype = src_rgm->mask_type; | |
324 tree dest_masktype = dest_rgm->mask_type; | |
325 machine_mode src_mode = TYPE_MODE (src_masktype); | |
326 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter | |
327 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing | |
328 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing) | |
329 { | |
330 /* Unpacking the source masks gives at least as many mask bits as | |
331 we need. We can then VIEW_CONVERT any excess bits away. */ | |
332 tree unpack_masktype = vect_halve_mask_nunits (src_masktype); | |
333 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i) | |
334 { | |
335 tree src = src_rgm->masks[i / 2]; | |
336 tree dest = dest_rgm->masks[i]; | |
337 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1) | |
338 ? VEC_UNPACK_HI_EXPR | |
339 : VEC_UNPACK_LO_EXPR); | |
340 gassign *stmt; | |
341 if (dest_masktype == unpack_masktype) | |
342 stmt = gimple_build_assign (dest, code, src); | |
343 else | |
344 { | |
345 tree temp = make_ssa_name (unpack_masktype); | |
346 stmt = gimple_build_assign (temp, code, src); | |
347 gimple_seq_add_stmt (seq, stmt); | |
348 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR, | |
349 build1 (VIEW_CONVERT_EXPR, | |
350 dest_masktype, temp)); | |
351 } | |
352 gimple_seq_add_stmt (seq, stmt); | |
353 } | |
354 return true; | |
355 } | |
356 vec_perm_indices indices[2]; | |
357 if (dest_masktype == src_masktype | |
358 && interleave_supported_p (&indices[0], src_masktype, 0) | |
359 && interleave_supported_p (&indices[1], src_masktype, 1)) | |
360 { | |
361 /* The destination requires twice as many mask bits as the source, so | |
362 we can use interleaving permutes to double up the number of bits. */ | |
363 tree masks[2]; | |
364 for (unsigned int i = 0; i < 2; ++i) | |
365 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]); | |
366 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i) | |
367 { | |
368 tree src = src_rgm->masks[i / 2]; | |
369 tree dest = dest_rgm->masks[i]; | |
370 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR, | |
371 src, src, masks[i & 1]); | |
372 gimple_seq_add_stmt (seq, stmt); | |
373 } | |
374 return true; | |
375 } | |
376 return false; | |
377 } | |
378 | |
379 /* Helper for vect_set_loop_condition_masked. Generate definitions for | |
380 all the masks in RGM and return a mask that is nonzero when the loop | |
381 needs to iterate. Add any new preheader statements to PREHEADER_SEQ. | |
382 Use LOOP_COND_GSI to insert code before the exit gcond. | |
383 | |
384 RGM belongs to loop LOOP. The loop originally iterated NITERS | |
385 times and has been vectorized according to LOOP_VINFO. Each iteration | |
386 of the vectorized loop handles VF iterations of the scalar loop. | |
387 | |
388 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop | |
389 starts with NITERS_SKIP dummy iterations of the scalar loop before | |
390 the real work starts. The mask elements for these dummy iterations | |
391 must be 0, to ensure that the extra iterations do not have an effect. | |
392 | |
393 It is known that: | |
394 | |
395 NITERS * RGM->max_nscalars_per_iter | |
396 | |
397 does not overflow. However, MIGHT_WRAP_P says whether an induction | |
398 variable that starts at 0 and has step: | |
399 | |
400 VF * RGM->max_nscalars_per_iter | |
401 | |
402 might overflow before hitting a value above: | |
403 | |
404 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter | |
405 | |
406 This means that we cannot guarantee that such an induction variable | |
407 would ever hit a value that produces a set of all-false masks for RGM. */ | |
408 | |
409 static tree | |
410 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, | |
411 gimple_seq *preheader_seq, | |
412 gimple_stmt_iterator loop_cond_gsi, | |
413 rgroup_masks *rgm, tree vf, | |
414 tree niters, tree niters_skip, | |
415 bool might_wrap_p) | |
416 { | |
417 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); | |
418 tree mask_type = rgm->mask_type; | |
419 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter; | |
420 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type); | |
421 | |
422 /* Calculate the maximum number of scalar values that the rgroup | |
423 handles in total, the number that it handles for each iteration | |
424 of the vector loop, and the number that it should skip during the | |
425 first iteration of the vector loop. */ | |
426 tree nscalars_total = niters; | |
427 tree nscalars_step = vf; | |
428 tree nscalars_skip = niters_skip; | |
429 if (nscalars_per_iter != 1) | |
430 { | |
431 /* We checked before choosing to use a fully-masked loop that these | |
432 multiplications don't overflow. */ | |
433 tree factor = build_int_cst (compare_type, nscalars_per_iter); | |
434 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type, | |
435 nscalars_total, factor); | |
436 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type, | |
437 nscalars_step, factor); | |
438 if (nscalars_skip) | |
439 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type, | |
440 nscalars_skip, factor); | |
441 } | |
442 | |
443 /* Create an induction variable that counts the number of scalars | |
444 processed. */ | |
445 tree index_before_incr, index_after_incr; | |
446 gimple_stmt_iterator incr_gsi; | |
447 bool insert_after; | |
448 tree zero_index = build_int_cst (compare_type, 0); | |
449 standard_iv_increment_position (loop, &incr_gsi, &insert_after); | |
450 create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi, | |
451 insert_after, &index_before_incr, &index_after_incr); | |
452 | |
453 tree test_index, test_limit, first_limit; | |
454 gimple_stmt_iterator *test_gsi; | |
455 if (might_wrap_p) | |
456 { | |
457 /* In principle the loop should stop iterating once the incremented | |
458 IV reaches a value greater than or equal to: | |
459 | |
460 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP | |
461 | |
462 However, there's no guarantee that this addition doesn't overflow | |
463 the comparison type, or that the IV hits a value above it before | |
464 wrapping around. We therefore adjust the limit down by one | |
465 IV step: | |
466 | |
467 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP) | |
468 -[infinite-prec] NSCALARS_STEP | |
469 | |
470 and compare the IV against this limit _before_ incrementing it. | |
471 Since the comparison type is unsigned, we actually want the | |
472 subtraction to saturate at zero: | |
473 | |
474 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP) | |
475 -[sat] NSCALARS_STEP | |
476 | |
477 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as: | |
478 | |
479 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP) | |
480 | |
481 where the rightmost subtraction can be done directly in | |
482 COMPARE_TYPE. */ | |
483 test_index = index_before_incr; | |
484 tree adjust = nscalars_step; | |
485 if (nscalars_skip) | |
486 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type, | |
487 adjust, nscalars_skip); | |
488 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type, | |
489 nscalars_total, adjust); | |
490 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type, | |
491 test_limit, adjust); | |
492 test_gsi = &incr_gsi; | |
493 | |
494 /* Get a safe limit for the first iteration. */ | |
495 if (nscalars_skip) | |
496 { | |
497 /* The first vector iteration can handle at most NSCALARS_STEP | |
498 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding | |
499 NSCALARS_SKIP to that cannot overflow. */ | |
500 tree const_limit = build_int_cst (compare_type, | |
501 LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
502 * nscalars_per_iter); | |
503 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type, | |
504 nscalars_total, const_limit); | |
505 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type, | |
506 first_limit, nscalars_skip); | |
507 } | |
508 else | |
509 /* For the first iteration it doesn't matter whether the IV hits | |
510 a value above NSCALARS_TOTAL. That only matters for the latch | |
511 condition. */ | |
512 first_limit = nscalars_total; | |
513 } | |
514 else | |
515 { | |
516 /* Test the incremented IV, which will always hit a value above | |
517 the bound before wrapping. */ | |
518 test_index = index_after_incr; | |
519 test_limit = nscalars_total; | |
520 if (nscalars_skip) | |
521 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type, | |
522 test_limit, nscalars_skip); | |
523 test_gsi = &loop_cond_gsi; | |
524 | |
525 first_limit = test_limit; | |
526 } | |
527 | |
528 /* Provide a definition of each mask in the group. */ | |
529 tree next_mask = NULL_TREE; | |
530 tree mask; | |
531 unsigned int i; | |
532 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask) | |
533 { | |
534 /* Previous masks will cover BIAS scalars. This mask covers the | |
535 next batch. */ | |
536 poly_uint64 bias = nscalars_per_mask * i; | |
537 tree bias_tree = build_int_cst (compare_type, bias); | |
538 gimple *tmp_stmt; | |
539 | |
540 /* See whether the first iteration of the vector loop is known | |
541 to have a full mask. */ | |
542 poly_uint64 const_limit; | |
543 bool first_iteration_full | |
544 = (poly_int_tree_p (first_limit, &const_limit) | |
545 && known_ge (const_limit, (i + 1) * nscalars_per_mask)); | |
546 | |
547 /* Rather than have a new IV that starts at BIAS and goes up to | |
548 TEST_LIMIT, prefer to use the same 0-based IV for each mask | |
549 and adjust the bound down by BIAS. */ | |
550 tree this_test_limit = test_limit; | |
551 if (i != 0) | |
552 { | |
553 this_test_limit = gimple_build (preheader_seq, MAX_EXPR, | |
554 compare_type, this_test_limit, | |
555 bias_tree); | |
556 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, | |
557 compare_type, this_test_limit, | |
558 bias_tree); | |
559 } | |
560 | |
561 /* Create the initial mask. First include all scalars that | |
562 are within the loop limit. */ | |
563 tree init_mask = NULL_TREE; | |
564 if (!first_iteration_full) | |
565 { | |
566 tree start, end; | |
567 if (first_limit == test_limit) | |
568 { | |
569 /* Use a natural test between zero (the initial IV value) | |
570 and the loop limit. The "else" block would be valid too, | |
571 but this choice can avoid the need to load BIAS_TREE into | |
572 a register. */ | |
573 start = zero_index; | |
574 end = this_test_limit; | |
575 } | |
576 else | |
577 { | |
578 /* FIRST_LIMIT is the maximum number of scalars handled by the | |
579 first iteration of the vector loop. Test the portion | |
580 associated with this mask. */ | |
581 start = bias_tree; | |
582 end = first_limit; | |
583 } | |
584 | |
585 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask"); | |
586 tmp_stmt = vect_gen_while (init_mask, start, end); | |
587 gimple_seq_add_stmt (preheader_seq, tmp_stmt); | |
588 } | |
589 | |
590 /* Now AND out the bits that are within the number of skipped | |
591 scalars. */ | |
592 poly_uint64 const_skip; | |
593 if (nscalars_skip | |
594 && !(poly_int_tree_p (nscalars_skip, &const_skip) | |
595 && known_le (const_skip, bias))) | |
596 { | |
597 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type, | |
598 bias_tree, nscalars_skip); | |
599 if (init_mask) | |
600 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type, | |
601 init_mask, unskipped_mask); | |
602 else | |
603 init_mask = unskipped_mask; | |
604 } | |
605 | |
606 if (!init_mask) | |
607 /* First iteration is full. */ | |
608 init_mask = build_minus_one_cst (mask_type); | |
609 | |
610 /* Get the mask value for the next iteration of the loop. */ | |
611 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask"); | |
612 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit); | |
613 gsi_insert_before (test_gsi, call, GSI_SAME_STMT); | |
614 | |
615 vect_set_loop_mask (loop, mask, init_mask, next_mask); | |
616 } | |
617 return next_mask; | |
618 } | |
619 | |
620 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls. | |
621 LOOP_VINFO describes the vectorization of LOOP. NITERS is the | |
622 number of iterations of the original scalar loop that should be | |
623 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are | |
624 as for vect_set_loop_condition. | |
625 | |
626 Insert the branch-back condition before LOOP_COND_GSI and return the | |
627 final gcond. */ | |
628 | |
629 static gcond * | |
630 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo, | |
631 tree niters, tree final_iv, | |
632 bool niters_maybe_zero, | |
633 gimple_stmt_iterator loop_cond_gsi) | |
634 { | |
635 gimple_seq preheader_seq = NULL; | |
636 gimple_seq header_seq = NULL; | |
637 | |
638 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); | |
639 unsigned int compare_precision = TYPE_PRECISION (compare_type); | |
640 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo); | |
641 tree orig_niters = niters; | |
642 | |
643 /* Type of the initial value of NITERS. */ | |
644 tree ni_actual_type = TREE_TYPE (niters); | |
645 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type); | |
646 | |
647 /* Convert NITERS to the same size as the compare. */ | |
648 if (compare_precision > ni_actual_precision | |
649 && niters_maybe_zero) | |
650 { | |
651 /* We know that there is always at least one iteration, so if the | |
652 count is zero then it must have wrapped. Cope with this by | |
653 subtracting 1 before the conversion and adding 1 to the result. */ | |
654 gcc_assert (TYPE_UNSIGNED (ni_actual_type)); | |
655 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type, | |
656 niters, build_minus_one_cst (ni_actual_type)); | |
657 niters = gimple_convert (&preheader_seq, compare_type, niters); | |
658 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type, | |
659 niters, build_one_cst (compare_type)); | |
660 } | |
661 else | |
662 niters = gimple_convert (&preheader_seq, compare_type, niters); | |
663 | |
664 /* Convert skip_niters to the right type. */ | |
665 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); | |
666 | |
667 /* Now calculate the value that the induction variable must be able | |
668 to hit in order to ensure that we end the loop with an all-false mask. | |
669 This involves adding the maximum number of inactive trailing scalar | |
670 iterations. */ | |
671 widest_int iv_limit; | |
672 bool known_max_iters = max_loop_iterations (loop, &iv_limit); | |
673 if (known_max_iters) | |
674 { | |
675 if (niters_skip) | |
676 { | |
677 /* Add the maximum number of skipped iterations to the | |
678 maximum iteration count. */ | |
679 if (TREE_CODE (niters_skip) == INTEGER_CST) | |
680 iv_limit += wi::to_widest (niters_skip); | |
681 else | |
682 iv_limit += max_vf - 1; | |
683 } | |
684 /* IV_LIMIT is the maximum number of latch iterations, which is also | |
685 the maximum in-range IV value. Round this value down to the previous | |
686 vector alignment boundary and then add an extra full iteration. */ | |
687 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
688 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf; | |
689 } | |
690 | |
691 /* Get the vectorization factor in tree form. */ | |
692 tree vf = build_int_cst (compare_type, | |
693 LOOP_VINFO_VECT_FACTOR (loop_vinfo)); | |
694 | |
695 /* Iterate over all the rgroups and fill in their masks. We could use | |
696 the first mask from any rgroup for the loop condition; here we | |
697 arbitrarily pick the last. */ | |
698 tree test_mask = NULL_TREE; | |
699 rgroup_masks *rgm; | |
700 unsigned int i; | |
701 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); | |
702 FOR_EACH_VEC_ELT (*masks, i, rgm) | |
703 if (!rgm->masks.is_empty ()) | |
704 { | |
705 /* First try using permutes. This adds a single vector | |
706 instruction to the loop for each mask, but needs no extra | |
707 loop invariants or IVs. */ | |
708 unsigned int nmasks = i + 1; | |
709 if ((nmasks & 1) == 0) | |
710 { | |
711 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1]; | |
712 if (!half_rgm->masks.is_empty () | |
713 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm)) | |
714 continue; | |
715 } | |
716 | |
717 /* See whether zero-based IV would ever generate all-false masks | |
718 before wrapping around. */ | |
719 bool might_wrap_p | |
720 = (!known_max_iters | |
721 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter, | |
722 UNSIGNED) | |
723 > compare_precision)); | |
724 | |
725 /* Set up all masks for this group. */ | |
726 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo, | |
727 &preheader_seq, | |
728 loop_cond_gsi, rgm, vf, | |
729 niters, niters_skip, | |
730 might_wrap_p); | |
731 } | |
732 | |
733 /* Emit all accumulated statements. */ | |
734 add_preheader_seq (loop, preheader_seq); | |
735 add_header_seq (loop, header_seq); | |
736 | |
737 /* Get a boolean result that tells us whether to iterate. */ | |
738 edge exit_edge = single_exit (loop); | |
739 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR; | |
740 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask)); | |
741 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask, | |
742 NULL_TREE, NULL_TREE); | |
743 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); | |
744 | |
745 /* The loop iterates (NITERS - 1) / VF + 1 times. | |
746 Subtract one from this to get the latch count. */ | |
747 tree step = build_int_cst (compare_type, | |
748 LOOP_VINFO_VECT_FACTOR (loop_vinfo)); | |
749 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters, | |
750 build_minus_one_cst (compare_type)); | |
751 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type, | |
752 niters_minus_one, step); | |
753 | |
754 if (final_iv) | |
755 { | |
756 gassign *assign = gimple_build_assign (final_iv, orig_niters); | |
757 gsi_insert_on_edge_immediate (single_exit (loop), assign); | |
758 } | |
759 | |
760 return cond_stmt; | |
761 } | |
762 | |
763 /* Like vect_set_loop_condition, but handle the case in which there | |
764 are no loop masks. */ | |
765 | |
766 static gcond * | |
767 vect_set_loop_condition_unmasked (struct loop *loop, tree niters, | |
768 tree step, tree final_iv, | |
769 bool niters_maybe_zero, | |
770 gimple_stmt_iterator loop_cond_gsi) | |
257 { | 771 { |
258 tree indx_before_incr, indx_after_incr; | 772 tree indx_before_incr, indx_after_incr; |
259 gcond *cond_stmt; | 773 gcond *cond_stmt; |
260 gcond *orig_cond; | 774 gcond *orig_cond; |
775 edge pe = loop_preheader_edge (loop); | |
261 edge exit_edge = single_exit (loop); | 776 edge exit_edge = single_exit (loop); |
262 gimple_stmt_iterator loop_cond_gsi; | |
263 gimple_stmt_iterator incr_gsi; | 777 gimple_stmt_iterator incr_gsi; |
264 bool insert_after; | 778 bool insert_after; |
265 tree init = build_int_cst (TREE_TYPE (niters), 0); | |
266 tree step = build_int_cst (TREE_TYPE (niters), 1); | |
267 source_location loop_loc; | |
268 enum tree_code code; | 779 enum tree_code code; |
780 tree niters_type = TREE_TYPE (niters); | |
269 | 781 |
270 orig_cond = get_loop_exit_condition (loop); | 782 orig_cond = get_loop_exit_condition (loop); |
271 gcc_assert (orig_cond); | 783 gcc_assert (orig_cond); |
272 loop_cond_gsi = gsi_for_stmt (orig_cond); | 784 loop_cond_gsi = gsi_for_stmt (orig_cond); |
273 | 785 |
786 tree init, limit; | |
787 if (!niters_maybe_zero && integer_onep (step)) | |
788 { | |
789 /* In this case we can use a simple 0-based IV: | |
790 | |
791 A: | |
792 x = 0; | |
793 do | |
794 { | |
795 ... | |
796 x += 1; | |
797 } | |
798 while (x < NITERS); */ | |
799 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR; | |
800 init = build_zero_cst (niters_type); | |
801 limit = niters; | |
802 } | |
803 else | |
804 { | |
805 /* The following works for all values of NITERS except 0: | |
806 | |
807 B: | |
808 x = 0; | |
809 do | |
810 { | |
811 ... | |
812 x += STEP; | |
813 } | |
814 while (x <= NITERS - STEP); | |
815 | |
816 so that the loop continues to iterate if x + STEP - 1 < NITERS | |
817 but stops if x + STEP - 1 >= NITERS. | |
818 | |
819 However, if NITERS is zero, x never hits a value above NITERS - STEP | |
820 before wrapping around. There are two obvious ways of dealing with | |
821 this: | |
822 | |
823 - start at STEP - 1 and compare x before incrementing it | |
824 - start at -1 and compare x after incrementing it | |
825 | |
826 The latter is simpler and is what we use. The loop in this case | |
827 looks like: | |
828 | |
829 C: | |
830 x = -1; | |
831 do | |
832 { | |
833 ... | |
834 x += STEP; | |
835 } | |
836 while (x < NITERS - STEP); | |
837 | |
838 In both cases the loop limit is NITERS - STEP. */ | |
839 gimple_seq seq = NULL; | |
840 limit = force_gimple_operand (niters, &seq, true, NULL_TREE); | |
841 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step); | |
842 if (seq) | |
843 { | |
844 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); | |
845 gcc_assert (!new_bb); | |
846 } | |
847 if (niters_maybe_zero) | |
848 { | |
849 /* Case C. */ | |
850 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR; | |
851 init = build_all_ones_cst (niters_type); | |
852 } | |
853 else | |
854 { | |
855 /* Case B. */ | |
856 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR; | |
857 init = build_zero_cst (niters_type); | |
858 } | |
859 } | |
860 | |
274 standard_iv_increment_position (loop, &incr_gsi, &insert_after); | 861 standard_iv_increment_position (loop, &incr_gsi, &insert_after); |
275 create_iv (init, step, NULL_TREE, loop, | 862 create_iv (init, step, NULL_TREE, loop, |
276 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr); | 863 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr); |
277 | |
278 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr, | 864 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr, |
279 true, NULL_TREE, true, | 865 true, NULL_TREE, true, |
280 GSI_SAME_STMT); | 866 GSI_SAME_STMT); |
281 niters = force_gimple_operand_gsi (&loop_cond_gsi, niters, true, NULL_TREE, | 867 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE, |
282 true, GSI_SAME_STMT); | 868 true, GSI_SAME_STMT); |
283 | 869 |
284 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR; | 870 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE, |
285 cond_stmt = gimple_build_cond (code, indx_after_incr, niters, NULL_TREE, | |
286 NULL_TREE); | 871 NULL_TREE); |
287 | 872 |
288 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); | 873 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); |
289 | 874 |
290 /* Remove old loop exit test: */ | 875 /* Record the number of latch iterations. */ |
291 gsi_remove (&loop_cond_gsi, true); | 876 if (limit == niters) |
292 free_stmt_vec_info (orig_cond); | 877 /* Case A: the loop iterates NITERS times. Subtract one to get the |
293 | 878 latch count. */ |
294 loop_loc = find_loop_location (loop); | 879 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters, |
880 build_int_cst (niters_type, 1)); | |
881 else | |
882 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times. | |
883 Subtract one from this to get the latch count. */ | |
884 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type, | |
885 limit, step); | |
886 | |
887 if (final_iv) | |
888 { | |
889 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR, | |
890 indx_after_incr, init); | |
891 gsi_insert_on_edge_immediate (single_exit (loop), assign); | |
892 } | |
893 | |
894 return cond_stmt; | |
895 } | |
896 | |
897 /* If we're using fully-masked loops, make LOOP iterate: | |
898 | |
899 N == (NITERS - 1) / STEP + 1 | |
900 | |
901 times. When NITERS is zero, this is equivalent to making the loop | |
902 execute (1 << M) / STEP times, where M is the precision of NITERS. | |
903 NITERS_MAYBE_ZERO is true if this last case might occur. | |
904 | |
905 If we're not using fully-masked loops, make LOOP iterate: | |
906 | |
907 N == (NITERS - STEP) / STEP + 1 | |
908 | |
909 times, where NITERS is known to be outside the range [1, STEP - 1]. | |
910 This is equivalent to making the loop execute NITERS / STEP times | |
911 when NITERS is nonzero and (1 << M) / STEP times otherwise. | |
912 NITERS_MAYBE_ZERO again indicates whether this last case might occur. | |
913 | |
914 If FINAL_IV is nonnull, it is an SSA name that should be set to | |
915 N * STEP on exit from the loop. | |
916 | |
917 Assumption: the exit-condition of LOOP is the last stmt in the loop. */ | |
918 | |
919 void | |
920 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo, | |
921 tree niters, tree step, tree final_iv, | |
922 bool niters_maybe_zero) | |
923 { | |
924 gcond *cond_stmt; | |
925 gcond *orig_cond = get_loop_exit_condition (loop); | |
926 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond); | |
927 | |
928 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) | |
929 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters, | |
930 final_iv, niters_maybe_zero, | |
931 loop_cond_gsi); | |
932 else | |
933 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step, | |
934 final_iv, niters_maybe_zero, | |
935 loop_cond_gsi); | |
936 | |
937 /* Remove old loop exit test. */ | |
938 stmt_vec_info orig_cond_info; | |
939 if (loop_vinfo | |
940 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond))) | |
941 loop_vinfo->remove_stmt (orig_cond_info); | |
942 else | |
943 gsi_remove (&loop_cond_gsi, true); | |
944 | |
295 if (dump_enabled_p ()) | 945 if (dump_enabled_p ()) |
296 { | 946 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G", |
297 if (LOCATION_LOCUS (loop_loc) != UNKNOWN_LOCATION) | 947 cond_stmt); |
298 dump_printf (MSG_NOTE, "\nloop at %s:%d: ", LOCATION_FILE (loop_loc), | |
299 LOCATION_LINE (loop_loc)); | |
300 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0); | |
301 } | |
302 | |
303 /* Record the number of latch iterations. */ | |
304 loop->nb_iterations = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), niters, | |
305 build_int_cst (TREE_TYPE (niters), 1)); | |
306 } | 948 } |
307 | 949 |
308 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg. | 950 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg. |
309 For all PHI arguments in FROM->dest and TO->dest from those | 951 For all PHI arguments in FROM->dest and TO->dest from those |
310 edges ensure that TO->dest PHI arguments have current_def | 952 edges ensure that TO->dest PHI arguments have current_def |
637 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0)); | 1279 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0)); |
638 imm_use_iterator imm_iter; | 1280 imm_use_iterator imm_iter; |
639 gimple *stmt; | 1281 gimple *stmt; |
640 use_operand_p use_p; | 1282 use_operand_p use_p; |
641 | 1283 |
1284 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop) | |
1285 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop); | |
642 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION); | 1286 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION); |
643 gimple_phi_set_result (new_phi, new_vop); | 1287 gimple_phi_set_result (new_phi, new_vop); |
644 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop) | 1288 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop) |
645 if (stmt != new_phi | 1289 if (stmt != new_phi |
646 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt))) | 1290 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt))) |
657 Extract the location of the loop in the source code. | 1301 Extract the location of the loop in the source code. |
658 If the loop is not well formed for vectorization, an estimated | 1302 If the loop is not well formed for vectorization, an estimated |
659 location is calculated. | 1303 location is calculated. |
660 Return the loop location if succeed and NULL if not. */ | 1304 Return the loop location if succeed and NULL if not. */ |
661 | 1305 |
662 source_location | 1306 dump_user_location_t |
663 find_loop_location (struct loop *loop) | 1307 find_loop_location (struct loop *loop) |
664 { | 1308 { |
665 gimple *stmt = NULL; | 1309 gimple *stmt = NULL; |
666 basic_block bb; | 1310 basic_block bb; |
667 gimple_stmt_iterator si; | 1311 gimple_stmt_iterator si; |
668 | 1312 |
669 if (!loop) | 1313 if (!loop) |
670 return UNKNOWN_LOCATION; | 1314 return dump_user_location_t (); |
671 | 1315 |
672 stmt = get_loop_exit_condition (loop); | 1316 stmt = get_loop_exit_condition (loop); |
673 | 1317 |
674 if (stmt | 1318 if (stmt |
675 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION) | 1319 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION) |
676 return gimple_location (stmt); | 1320 return stmt; |
677 | 1321 |
678 /* If we got here the loop is probably not "well formed", | 1322 /* If we got here the loop is probably not "well formed", |
679 try to estimate the loop location */ | 1323 try to estimate the loop location */ |
680 | 1324 |
681 if (!loop->header) | 1325 if (!loop->header) |
682 return UNKNOWN_LOCATION; | 1326 return dump_user_location_t (); |
683 | 1327 |
684 bb = loop->header; | 1328 bb = loop->header; |
685 | 1329 |
686 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) | 1330 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) |
687 { | 1331 { |
688 stmt = gsi_stmt (si); | 1332 stmt = gsi_stmt (si); |
689 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION) | 1333 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION) |
690 return gimple_location (stmt); | 1334 return stmt; |
691 } | 1335 } |
692 | 1336 |
693 return UNKNOWN_LOCATION; | 1337 return dump_user_location_t (); |
694 } | 1338 } |
695 | 1339 |
696 /* Return true if PHI defines an IV of the loop to be vectorized. */ | 1340 /* Return true if the phi described by STMT_INFO defines an IV of the |
1341 loop to be vectorized. */ | |
697 | 1342 |
698 static bool | 1343 static bool |
699 iv_phi_p (gphi *phi) | 1344 iv_phi_p (stmt_vec_info stmt_info) |
700 { | 1345 { |
1346 gphi *phi = as_a <gphi *> (stmt_info->stmt); | |
701 if (virtual_operand_p (PHI_RESULT (phi))) | 1347 if (virtual_operand_p (PHI_RESULT (phi))) |
702 return false; | 1348 return false; |
703 | 1349 |
704 stmt_vec_info stmt_info = vinfo_for_stmt (phi); | |
705 gcc_assert (stmt_info != NULL); | |
706 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def | 1350 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
707 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) | 1351 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) |
708 return false; | 1352 return false; |
709 | 1353 |
710 return true; | 1354 return true; |
733 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) | 1377 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) |
734 { | 1378 { |
735 tree evolution_part; | 1379 tree evolution_part; |
736 | 1380 |
737 gphi *phi = gsi.phi (); | 1381 gphi *phi = gsi.phi (); |
1382 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi); | |
738 if (dump_enabled_p ()) | 1383 if (dump_enabled_p ()) |
739 { | 1384 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", |
740 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: "); | 1385 phi_info->stmt); |
741 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); | |
742 } | |
743 | 1386 |
744 /* Skip virtual phi's. The data dependences that are associated with | 1387 /* Skip virtual phi's. The data dependences that are associated with |
745 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. | 1388 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. |
746 | 1389 |
747 Skip reduction phis. */ | 1390 Skip reduction phis. */ |
748 if (!iv_phi_p (phi)) | 1391 if (!iv_phi_p (phi_info)) |
749 { | 1392 { |
750 if (dump_enabled_p ()) | 1393 if (dump_enabled_p ()) |
751 dump_printf_loc (MSG_NOTE, vect_location, | 1394 dump_printf_loc (MSG_NOTE, vect_location, |
752 "reduc or virtual phi. skip.\n"); | 1395 "reduc or virtual phi. skip.\n"); |
753 continue; | 1396 continue; |
754 } | 1397 } |
755 | 1398 |
756 /* Analyze the evolution function. */ | 1399 /* Analyze the evolution function. */ |
757 | 1400 |
758 evolution_part | 1401 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); |
759 = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi)); | |
760 if (evolution_part == NULL_TREE) | 1402 if (evolution_part == NULL_TREE) |
761 { | 1403 { |
762 if (dump_enabled_p ()) | 1404 if (dump_enabled_p ()) |
763 dump_printf (MSG_MISSED_OPTIMIZATION, | 1405 dump_printf (MSG_MISSED_OPTIMIZATION, |
764 "No access function or evolution.\n"); | 1406 "No access function or evolution.\n"); |
856 tree var, ni, ni_name; | 1498 tree var, ni, ni_name; |
857 gimple_stmt_iterator last_gsi; | 1499 gimple_stmt_iterator last_gsi; |
858 | 1500 |
859 gphi *phi = gsi.phi (); | 1501 gphi *phi = gsi.phi (); |
860 gphi *phi1 = gsi1.phi (); | 1502 gphi *phi1 = gsi1.phi (); |
1503 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi); | |
861 if (dump_enabled_p ()) | 1504 if (dump_enabled_p ()) |
862 { | 1505 dump_printf_loc (MSG_NOTE, vect_location, |
863 dump_printf_loc (MSG_NOTE, vect_location, | 1506 "vect_update_ivs_after_vectorizer: phi: %G", phi); |
864 "vect_update_ivs_after_vectorizer: phi: "); | |
865 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); | |
866 } | |
867 | 1507 |
868 /* Skip reduction and virtual phis. */ | 1508 /* Skip reduction and virtual phis. */ |
869 if (!iv_phi_p (phi)) | 1509 if (!iv_phi_p (phi_info)) |
870 { | 1510 { |
871 if (dump_enabled_p ()) | 1511 if (dump_enabled_p ()) |
872 dump_printf_loc (MSG_NOTE, vect_location, | 1512 dump_printf_loc (MSG_NOTE, vect_location, |
873 "reduc or virtual phi. skip.\n"); | 1513 "reduc or virtual phi. skip.\n"); |
874 continue; | 1514 continue; |
875 } | 1515 } |
876 | 1516 |
877 type = TREE_TYPE (gimple_phi_result (phi)); | 1517 type = TREE_TYPE (gimple_phi_result (phi)); |
878 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi)); | 1518 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); |
879 step_expr = unshare_expr (step_expr); | 1519 step_expr = unshare_expr (step_expr); |
880 | 1520 |
881 /* FORNOW: We do not support IVs whose evolution function is a polynomial | 1521 /* FORNOW: We do not support IVs whose evolution function is a polynomial |
882 of degree >= 2 or exponential. */ | 1522 of degree >= 2 or exponential. */ |
883 gcc_assert (!tree_is_chrec (step_expr)); | 1523 gcc_assert (!tree_is_chrec (step_expr)); |
907 /* Fix phi expressions in the successor bb. */ | 1547 /* Fix phi expressions in the successor bb. */ |
908 adjust_phi_and_debug_stmts (phi1, update_e, ni_name); | 1548 adjust_phi_and_debug_stmts (phi1, update_e, ni_name); |
909 } | 1549 } |
910 } | 1550 } |
911 | 1551 |
1552 /* Return a gimple value containing the misalignment (measured in vector | |
1553 elements) for the loop described by LOOP_VINFO, i.e. how many elements | |
1554 it is away from a perfectly aligned address. Add any new statements | |
1555 to SEQ. */ | |
1556 | |
1557 static tree | |
1558 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo) | |
1559 { | |
1560 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); | |
1561 stmt_vec_info stmt_info = dr_info->stmt; | |
1562 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
1563 | |
1564 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info); | |
1565 gcc_assert (target_align != 0); | |
1566 | |
1567 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr), | |
1568 size_zero_node) < 0; | |
1569 tree offset = (negative | |
1570 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) | |
1571 : size_zero_node); | |
1572 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq, | |
1573 offset); | |
1574 tree type = unsigned_type_for (TREE_TYPE (start_addr)); | |
1575 tree target_align_minus_1 = build_int_cst (type, target_align - 1); | |
1576 HOST_WIDE_INT elem_size | |
1577 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); | |
1578 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size)); | |
1579 | |
1580 /* Create: misalign_in_bytes = addr & (target_align - 1). */ | |
1581 tree int_start_addr = fold_convert (type, start_addr); | |
1582 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr, | |
1583 target_align_minus_1); | |
1584 | |
1585 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */ | |
1586 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, | |
1587 elem_size_log); | |
1588 | |
1589 return misalign_in_elems; | |
1590 } | |
1591 | |
912 /* Function vect_gen_prolog_loop_niters | 1592 /* Function vect_gen_prolog_loop_niters |
913 | 1593 |
914 Generate the number of iterations which should be peeled as prolog for the | 1594 Generate the number of iterations which should be peeled as prolog for the |
915 loop represented by LOOP_VINFO. It is calculated as the misalignment of | 1595 loop represented by LOOP_VINFO. It is calculated as the misalignment of |
916 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). | 1596 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). |
918 refer to an aligned location. The following computation is generated: | 1598 refer to an aligned location. The following computation is generated: |
919 | 1599 |
920 If the misalignment of DR is known at compile time: | 1600 If the misalignment of DR is known at compile time: |
921 addr_mis = int mis = DR_MISALIGNMENT (dr); | 1601 addr_mis = int mis = DR_MISALIGNMENT (dr); |
922 Else, compute address misalignment in bytes: | 1602 Else, compute address misalignment in bytes: |
923 addr_mis = addr & (vectype_align - 1) | 1603 addr_mis = addr & (target_align - 1) |
924 | 1604 |
925 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step | 1605 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step |
926 | 1606 |
927 (elem_size = element type size; an element is the scalar element whose type | 1607 (elem_size = element type size; an element is the scalar element whose type |
928 is the inner type of the vectype) | 1608 is the inner type of the vectype) |
942 | 1622 |
943 static tree | 1623 static tree |
944 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo, | 1624 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo, |
945 basic_block bb, int *bound) | 1625 basic_block bb, int *bound) |
946 { | 1626 { |
947 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); | 1627 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); |
948 tree var; | 1628 tree var; |
949 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); | 1629 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); |
950 gimple_seq stmts = NULL, new_stmts = NULL; | 1630 gimple_seq stmts = NULL, new_stmts = NULL; |
951 tree iters, iters_name; | 1631 tree iters, iters_name; |
952 gimple *dr_stmt = DR_STMT (dr); | 1632 stmt_vec_info stmt_info = dr_info->stmt; |
953 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt); | |
954 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | 1633 tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
955 unsigned int target_align = DR_TARGET_ALIGNMENT (dr); | 1634 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info); |
956 | 1635 |
957 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) | 1636 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) |
958 { | 1637 { |
959 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); | 1638 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
960 | 1639 |
965 iters = build_int_cst (niters_type, npeel); | 1644 iters = build_int_cst (niters_type, npeel); |
966 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); | 1645 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
967 } | 1646 } |
968 else | 1647 else |
969 { | 1648 { |
970 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; | 1649 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo); |
971 tree offset = negative | 1650 tree type = TREE_TYPE (misalign_in_elems); |
972 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node; | |
973 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, | |
974 &stmts, offset); | |
975 tree type = unsigned_type_for (TREE_TYPE (start_addr)); | |
976 tree target_align_minus_1 = build_int_cst (type, target_align - 1); | |
977 HOST_WIDE_INT elem_size | 1651 HOST_WIDE_INT elem_size |
978 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); | 1652 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); |
979 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size)); | |
980 HOST_WIDE_INT align_in_elems = target_align / elem_size; | 1653 HOST_WIDE_INT align_in_elems = target_align / elem_size; |
981 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1); | 1654 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1); |
982 tree align_in_elems_tree = build_int_cst (type, align_in_elems); | 1655 tree align_in_elems_tree = build_int_cst (type, align_in_elems); |
983 tree misalign_in_bytes; | |
984 tree misalign_in_elems; | |
985 | |
986 /* Create: misalign_in_bytes = addr & (target_align - 1). */ | |
987 misalign_in_bytes | |
988 = fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), | |
989 target_align_minus_1); | |
990 | |
991 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */ | |
992 misalign_in_elems | |
993 = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, elem_size_log); | |
994 | 1656 |
995 /* Create: (niters_type) ((align_in_elems - misalign_in_elems) | 1657 /* Create: (niters_type) ((align_in_elems - misalign_in_elems) |
996 & (align_in_elems - 1)). */ | 1658 & (align_in_elems - 1)). */ |
1659 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr), | |
1660 size_zero_node) < 0; | |
997 if (negative) | 1661 if (negative) |
998 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems, | 1662 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems, |
999 align_in_elems_tree); | 1663 align_in_elems_tree); |
1000 else | 1664 else |
1001 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree, | 1665 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree, |
1004 iters = fold_convert (niters_type, iters); | 1668 iters = fold_convert (niters_type, iters); |
1005 *bound = align_in_elems - 1; | 1669 *bound = align_in_elems - 1; |
1006 } | 1670 } |
1007 | 1671 |
1008 if (dump_enabled_p ()) | 1672 if (dump_enabled_p ()) |
1009 { | 1673 dump_printf_loc (MSG_NOTE, vect_location, |
1010 dump_printf_loc (MSG_NOTE, vect_location, | 1674 "niters for prolog loop: %T\n", iters); |
1011 "niters for prolog loop: "); | |
1012 dump_generic_expr (MSG_NOTE, TDF_SLIM, iters); | |
1013 dump_printf (MSG_NOTE, "\n"); | |
1014 } | |
1015 | 1675 |
1016 var = create_tmp_var (niters_type, "prolog_loop_niters"); | 1676 var = create_tmp_var (niters_type, "prolog_loop_niters"); |
1017 iters_name = force_gimple_operand (iters, &new_stmts, false, var); | 1677 iters_name = force_gimple_operand (iters, &new_stmts, false, var); |
1018 | 1678 |
1019 if (new_stmts) | 1679 if (new_stmts) |
1031 } | 1691 } |
1032 | 1692 |
1033 | 1693 |
1034 /* Function vect_update_init_of_dr | 1694 /* Function vect_update_init_of_dr |
1035 | 1695 |
1036 NITERS iterations were peeled from LOOP. DR represents a data reference | 1696 If CODE is PLUS, the vector loop starts NITERS iterations after the |
1037 in LOOP. This function updates the information recorded in DR to | 1697 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS |
1038 account for the fact that the first NITERS iterations had already been | 1698 iterations before the scalar one (using masking to skip inactive |
1039 executed. Specifically, it updates the OFFSET field of DR. */ | 1699 elements). This function updates the information recorded in DR to |
1700 account for the difference. Specifically, it updates the OFFSET | |
1701 field of DR. */ | |
1040 | 1702 |
1041 static void | 1703 static void |
1042 vect_update_init_of_dr (struct data_reference *dr, tree niters) | 1704 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code) |
1043 { | 1705 { |
1044 tree offset = DR_OFFSET (dr); | 1706 tree offset = DR_OFFSET (dr); |
1045 | 1707 |
1046 niters = fold_build2 (MULT_EXPR, sizetype, | 1708 niters = fold_build2 (MULT_EXPR, sizetype, |
1047 fold_convert (sizetype, niters), | 1709 fold_convert (sizetype, niters), |
1048 fold_convert (sizetype, DR_STEP (dr))); | 1710 fold_convert (sizetype, DR_STEP (dr))); |
1049 offset = fold_build2 (PLUS_EXPR, sizetype, | 1711 offset = fold_build2 (code, sizetype, |
1050 fold_convert (sizetype, offset), niters); | 1712 fold_convert (sizetype, offset), niters); |
1051 DR_OFFSET (dr) = offset; | 1713 DR_OFFSET (dr) = offset; |
1052 } | 1714 } |
1053 | 1715 |
1054 | 1716 |
1055 /* Function vect_update_inits_of_drs | 1717 /* Function vect_update_inits_of_drs |
1056 | 1718 |
1057 NITERS iterations were peeled from the loop represented by LOOP_VINFO. | 1719 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO. |
1058 This function updates the information recorded for the data references in | 1720 CODE and NITERS are as for vect_update_inits_of_dr. */ |
1059 the loop to account for the fact that the first NITERS iterations had | |
1060 already been executed. Specifically, it updates the initial_condition of | |
1061 the access_function of all the data_references in the loop. */ | |
1062 | 1721 |
1063 static void | 1722 static void |
1064 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters) | 1723 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, |
1724 tree_code code) | |
1065 { | 1725 { |
1066 unsigned int i; | 1726 unsigned int i; |
1067 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); | 1727 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
1068 struct data_reference *dr; | 1728 struct data_reference *dr; |
1069 | 1729 |
1070 if (dump_enabled_p ()) | 1730 DUMP_VECT_SCOPE ("vect_update_inits_of_dr"); |
1071 dump_printf_loc (MSG_NOTE, vect_location, | |
1072 "=== vect_update_inits_of_dr ===\n"); | |
1073 | 1731 |
1074 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */ | 1732 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */ |
1075 if (!types_compatible_p (sizetype, TREE_TYPE (niters))) | 1733 if (!types_compatible_p (sizetype, TREE_TYPE (niters))) |
1076 { | 1734 { |
1077 gimple_seq seq; | 1735 gimple_seq seq; |
1086 gcc_assert (!new_bb); | 1744 gcc_assert (!new_bb); |
1087 } | 1745 } |
1088 } | 1746 } |
1089 | 1747 |
1090 FOR_EACH_VEC_ELT (datarefs, i, dr) | 1748 FOR_EACH_VEC_ELT (datarefs, i, dr) |
1091 vect_update_init_of_dr (dr, niters); | 1749 { |
1092 } | 1750 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
1093 | 1751 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)) |
1752 vect_update_init_of_dr (dr, niters, code); | |
1753 } | |
1754 } | |
1755 | |
1756 /* For the information recorded in LOOP_VINFO prepare the loop for peeling | |
1757 by masking. This involves calculating the number of iterations to | |
1758 be peeled and then aligning all memory references appropriately. */ | |
1759 | |
1760 void | |
1761 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo) | |
1762 { | |
1763 tree misalign_in_elems; | |
1764 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); | |
1765 | |
1766 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo)); | |
1767 | |
1768 /* From the information recorded in LOOP_VINFO get the number of iterations | |
1769 that need to be skipped via masking. */ | |
1770 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) | |
1771 { | |
1772 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
1773 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)); | |
1774 misalign_in_elems = build_int_cst (type, misalign); | |
1775 } | |
1776 else | |
1777 { | |
1778 gimple_seq seq1 = NULL, seq2 = NULL; | |
1779 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo); | |
1780 misalign_in_elems = fold_convert (type, misalign_in_elems); | |
1781 misalign_in_elems = force_gimple_operand (misalign_in_elems, | |
1782 &seq2, true, NULL_TREE); | |
1783 gimple_seq_add_seq (&seq1, seq2); | |
1784 if (seq1) | |
1785 { | |
1786 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); | |
1787 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1); | |
1788 gcc_assert (!new_bb); | |
1789 } | |
1790 } | |
1791 | |
1792 if (dump_enabled_p ()) | |
1793 dump_printf_loc (MSG_NOTE, vect_location, | |
1794 "misalignment for fully-masked loop: %T\n", | |
1795 misalign_in_elems); | |
1796 | |
1797 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems; | |
1798 | |
1799 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR); | |
1800 } | |
1094 | 1801 |
1095 /* This function builds ni_name = number of iterations. Statements | 1802 /* This function builds ni_name = number of iterations. Statements |
1096 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set | 1803 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set |
1097 it to TRUE if new ssa_var is generated. */ | 1804 it to TRUE if new ssa_var is generated. */ |
1098 | 1805 |
1122 } | 1829 } |
1123 | 1830 |
1124 /* Calculate the number of iterations above which vectorized loop will be | 1831 /* Calculate the number of iterations above which vectorized loop will be |
1125 preferred than scalar loop. NITERS_PROLOG is the number of iterations | 1832 preferred than scalar loop. NITERS_PROLOG is the number of iterations |
1126 of prolog loop. If it's integer const, the integer number is also passed | 1833 of prolog loop. If it's integer const, the integer number is also passed |
1127 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (included) of | 1834 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the |
1128 number of iterations of prolog loop. VFM1 is vector factor minus one. | 1835 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding |
1129 If CHECK_PROFITABILITY is true, TH is the threshold below which scalar | 1836 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the |
1130 (rather than vectorized) loop will be executed. This function stores | 1837 threshold below which the scalar (rather than vectorized) loop will be |
1131 upper bound (included) of the result in BOUND_SCALAR. */ | 1838 executed. This function stores the upper bound (inclusive) of the result |
1839 in BOUND_SCALAR. */ | |
1132 | 1840 |
1133 static tree | 1841 static tree |
1134 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog, | 1842 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog, |
1135 int bound_prolog, int vfm1, int th, | 1843 int bound_prolog, poly_int64 bound_epilog, int th, |
1136 int *bound_scalar, bool check_profitability) | 1844 poly_uint64 *bound_scalar, |
1845 bool check_profitability) | |
1137 { | 1846 { |
1138 tree type = TREE_TYPE (niters_prolog); | 1847 tree type = TREE_TYPE (niters_prolog); |
1139 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog, | 1848 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog, |
1140 build_int_cst (type, vfm1)); | 1849 build_int_cst (type, bound_epilog)); |
1141 | 1850 |
1142 *bound_scalar = vfm1 + bound_prolog; | 1851 *bound_scalar = bound_prolog + bound_epilog; |
1143 if (check_profitability) | 1852 if (check_profitability) |
1144 { | 1853 { |
1145 /* TH indicates the minimum niters of vectorized loop, while we | 1854 /* TH indicates the minimum niters of vectorized loop, while we |
1146 compute the maximum niters of scalar loop. */ | 1855 compute the maximum niters of scalar loop. */ |
1147 th--; | 1856 th--; |
1148 /* Peeling for constant times. */ | 1857 /* Peeling for constant times. */ |
1149 if (int_niters_prolog >= 0) | 1858 if (int_niters_prolog >= 0) |
1150 { | 1859 { |
1151 *bound_scalar = (int_niters_prolog + vfm1 < th | 1860 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th); |
1152 ? th | |
1153 : vfm1 + int_niters_prolog); | |
1154 return build_int_cst (type, *bound_scalar); | 1861 return build_int_cst (type, *bound_scalar); |
1155 } | 1862 } |
1156 /* Peeling for unknown times. Note BOUND_PROLOG is the upper | 1863 /* Peeling an unknown number of times. Note that both BOUND_PROLOG |
1157 bound (inlcuded) of niters of prolog loop. */ | 1864 and BOUND_EPILOG are inclusive upper bounds. */ |
1158 if (th >= vfm1 + bound_prolog) | 1865 if (known_ge (th, bound_prolog + bound_epilog)) |
1159 { | 1866 { |
1160 *bound_scalar = th; | 1867 *bound_scalar = th; |
1161 return build_int_cst (type, th); | 1868 return build_int_cst (type, th); |
1162 } | 1869 } |
1163 /* Need to do runtime comparison, but BOUND_SCALAR remains the same. */ | 1870 /* Need to do runtime comparison. */ |
1164 else if (th > vfm1) | 1871 else if (maybe_gt (th, bound_epilog)) |
1165 return fold_build2 (MAX_EXPR, type, build_int_cst (type, th), niters); | 1872 { |
1873 *bound_scalar = upper_bound (*bound_scalar, th); | |
1874 return fold_build2 (MAX_EXPR, type, | |
1875 build_int_cst (type, th), niters); | |
1876 } | |
1166 } | 1877 } |
1167 return niters; | 1878 return niters; |
1168 } | 1879 } |
1169 | 1880 |
1170 /* This function generates the following statements: | 1881 /* NITERS is the number of times that the original scalar loop executes |
1171 | 1882 after peeling. Work out the maximum number of iterations N that can |
1172 niters = number of iterations loop executes (after peeling) | 1883 be handled by the vectorized form of the loop and then either: |
1173 niters_vector = niters / vf | 1884 |
1174 | 1885 a) set *STEP_VECTOR_PTR to the vectorization factor and generate: |
1175 and places them on the loop preheader edge. NITERS_NO_OVERFLOW is | 1886 |
1176 true if NITERS doesn't overflow. */ | 1887 niters_vector = N |
1888 | |
1889 b) set *STEP_VECTOR_PTR to one and generate: | |
1890 | |
1891 niters_vector = N / vf | |
1892 | |
1893 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add | |
1894 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW | |
1895 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */ | |
1177 | 1896 |
1178 void | 1897 void |
1179 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters, | 1898 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters, |
1180 tree *niters_vector_ptr, bool niters_no_overflow) | 1899 tree *niters_vector_ptr, tree *step_vector_ptr, |
1900 bool niters_no_overflow) | |
1181 { | 1901 { |
1182 tree ni_minus_gap, var; | 1902 tree ni_minus_gap, var; |
1183 tree niters_vector, type = TREE_TYPE (niters); | 1903 tree niters_vector, step_vector, type = TREE_TYPE (niters); |
1184 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | 1904 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
1185 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); | 1905 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); |
1186 tree log_vf = build_int_cst (type, exact_log2 (vf)); | 1906 tree log_vf = NULL_TREE; |
1187 | 1907 |
1188 /* If epilogue loop is required because of data accesses with gaps, we | 1908 /* If epilogue loop is required because of data accesses with gaps, we |
1189 subtract one iteration from the total number of iterations here for | 1909 subtract one iteration from the total number of iterations here for |
1190 correct calculation of RATIO. */ | 1910 correct calculation of RATIO. */ |
1191 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) | 1911 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) |
1202 } | 1922 } |
1203 } | 1923 } |
1204 else | 1924 else |
1205 ni_minus_gap = niters; | 1925 ni_minus_gap = niters; |
1206 | 1926 |
1207 /* Create: niters >> log2(vf) */ | 1927 unsigned HOST_WIDE_INT const_vf; |
1208 /* If it's known that niters == number of latch executions + 1 doesn't | 1928 if (vf.is_constant (&const_vf) |
1209 overflow, we can generate niters >> log2(vf); otherwise we generate | 1929 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) |
1210 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio | 1930 { |
1211 will be at least one. */ | 1931 /* Create: niters >> log2(vf) */ |
1212 if (niters_no_overflow) | 1932 /* If it's known that niters == number of latch executions + 1 doesn't |
1213 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf); | 1933 overflow, we can generate niters >> log2(vf); otherwise we generate |
1934 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio | |
1935 will be at least one. */ | |
1936 log_vf = build_int_cst (type, exact_log2 (const_vf)); | |
1937 if (niters_no_overflow) | |
1938 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf); | |
1939 else | |
1940 niters_vector | |
1941 = fold_build2 (PLUS_EXPR, type, | |
1942 fold_build2 (RSHIFT_EXPR, type, | |
1943 fold_build2 (MINUS_EXPR, type, | |
1944 ni_minus_gap, | |
1945 build_int_cst (type, vf)), | |
1946 log_vf), | |
1947 build_int_cst (type, 1)); | |
1948 step_vector = build_one_cst (type); | |
1949 } | |
1214 else | 1950 else |
1215 niters_vector | 1951 { |
1216 = fold_build2 (PLUS_EXPR, type, | 1952 niters_vector = ni_minus_gap; |
1217 fold_build2 (RSHIFT_EXPR, type, | 1953 step_vector = build_int_cst (type, vf); |
1218 fold_build2 (MINUS_EXPR, type, ni_minus_gap, | 1954 } |
1219 build_int_cst (type, vf)), | |
1220 log_vf), | |
1221 build_int_cst (type, 1)); | |
1222 | 1955 |
1223 if (!is_gimple_val (niters_vector)) | 1956 if (!is_gimple_val (niters_vector)) |
1224 { | 1957 { |
1225 var = create_tmp_var (type, "bnd"); | 1958 var = create_tmp_var (type, "bnd"); |
1226 gimple_seq stmts = NULL; | 1959 gimple_seq stmts = NULL; |
1227 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var); | 1960 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var); |
1228 gsi_insert_seq_on_edge_immediate (pe, stmts); | 1961 gsi_insert_seq_on_edge_immediate (pe, stmts); |
1229 /* Peeling algorithm guarantees that vector loop bound is at least ONE, | 1962 /* Peeling algorithm guarantees that vector loop bound is at least ONE, |
1230 we set range information to make niters analyzer's life easier. */ | 1963 we set range information to make niters analyzer's life easier. */ |
1231 if (stmts != NULL) | 1964 if (stmts != NULL && log_vf) |
1232 set_range_info (niters_vector, VR_RANGE, | 1965 set_range_info (niters_vector, VR_RANGE, |
1233 wi::to_wide (build_int_cst (type, 1)), | 1966 wi::to_wide (build_int_cst (type, 1)), |
1234 wi::to_wide (fold_build2 (RSHIFT_EXPR, type, | 1967 wi::to_wide (fold_build2 (RSHIFT_EXPR, type, |
1235 TYPE_MAX_VALUE (type), | 1968 TYPE_MAX_VALUE (type), |
1236 log_vf))); | 1969 log_vf))); |
1237 } | 1970 } |
1238 *niters_vector_ptr = niters_vector; | 1971 *niters_vector_ptr = niters_vector; |
1972 *step_vector_ptr = step_vector; | |
1239 | 1973 |
1240 return; | 1974 return; |
1241 } | 1975 } |
1242 | 1976 |
1243 /* Given NITERS_VECTOR which is the number of iterations for vectorized | 1977 /* Given NITERS_VECTOR which is the number of iterations for vectorized |
1248 static void | 1982 static void |
1249 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo, | 1983 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo, |
1250 tree niters_vector, | 1984 tree niters_vector, |
1251 tree *niters_vector_mult_vf_ptr) | 1985 tree *niters_vector_mult_vf_ptr) |
1252 { | 1986 { |
1253 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | 1987 /* We should be using a step_vector of VF if VF is variable. */ |
1988 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant (); | |
1254 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | 1989 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1255 tree type = TREE_TYPE (niters_vector); | 1990 tree type = TREE_TYPE (niters_vector); |
1256 tree log_vf = build_int_cst (type, exact_log2 (vf)); | 1991 tree log_vf = build_int_cst (type, exact_log2 (vf)); |
1257 basic_block exit_bb = single_exit (loop)->dest; | 1992 basic_block exit_bb = single_exit (loop)->dest; |
1258 | 1993 |
1343 gphi *update_phi = gsi_update.phi (); | 2078 gphi *update_phi = gsi_update.phi (); |
1344 | 2079 |
1345 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e); | 2080 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e); |
1346 /* Generate lcssa PHI node for the first loop. */ | 2081 /* Generate lcssa PHI node for the first loop. */ |
1347 gphi *vect_phi = (loop == first) ? orig_phi : update_phi; | 2082 gphi *vect_phi = (loop == first) ? orig_phi : update_phi; |
1348 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi)) | 2083 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi); |
2084 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info)) | |
1349 { | 2085 { |
1350 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi)); | 2086 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi)); |
1351 gphi *lcssa_phi = create_phi_node (new_res, between_bb); | 2087 gphi *lcssa_phi = create_phi_node (new_res, between_bb); |
1352 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION); | 2088 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION); |
1353 arg = new_res; | 2089 arg = new_res; |
1595 - NITERSM1: The number of iterations of the loop's latch. | 2331 - NITERSM1: The number of iterations of the loop's latch. |
1596 - NITERS_NO_OVERFLOW: No overflow in computing NITERS. | 2332 - NITERS_NO_OVERFLOW: No overflow in computing NITERS. |
1597 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if | 2333 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if |
1598 CHECK_PROFITABILITY is true. | 2334 CHECK_PROFITABILITY is true. |
1599 Output: | 2335 Output: |
1600 - NITERS_VECTOR: The number of iterations of loop after vectorization. | 2336 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should |
2337 iterate after vectorization; see vect_set_loop_condition for details. | |
2338 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that | |
2339 should be set to the number of scalar iterations handled by the | |
2340 vector loop. The SSA name is only used on exit from the loop. | |
1601 | 2341 |
1602 This function peels prolog and epilog from the loop, adds guards skipping | 2342 This function peels prolog and epilog from the loop, adds guards skipping |
1603 PROLOG and EPILOG for various conditions. As a result, the changed CFG | 2343 PROLOG and EPILOG for various conditions. As a result, the changed CFG |
1604 would look like: | 2344 would look like: |
1605 | 2345 |
1652 versioning conditions if loop versioning is needed. */ | 2392 versioning conditions if loop versioning is needed. */ |
1653 | 2393 |
1654 | 2394 |
1655 struct loop * | 2395 struct loop * |
1656 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, | 2396 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, |
1657 tree *niters_vector, int th, bool check_profitability, | 2397 tree *niters_vector, tree *step_vector, |
1658 bool niters_no_overflow) | 2398 tree *niters_vector_mult_vf_var, int th, |
2399 bool check_profitability, bool niters_no_overflow) | |
1659 { | 2400 { |
1660 edge e, guard_e; | 2401 edge e, guard_e; |
1661 tree type = TREE_TYPE (niters), guard_cond; | 2402 tree type = TREE_TYPE (niters), guard_cond; |
1662 basic_block guard_bb, guard_to; | 2403 basic_block guard_bb, guard_to; |
1663 profile_probability prob_prolog, prob_vector, prob_epilog; | 2404 profile_probability prob_prolog, prob_vector, prob_epilog; |
1664 int bound_prolog = 0, bound_scalar = 0, bound = 0; | 2405 int estimated_vf; |
1665 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | 2406 int prolog_peeling = 0; |
1666 int prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); | 2407 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
1667 bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) | 2408 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
1668 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); | 2409 |
2410 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
2411 poly_uint64 bound_epilog = 0; | |
2412 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) | |
2413 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) | |
2414 bound_epilog += vf - 1; | |
2415 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) | |
2416 bound_epilog += 1; | |
2417 bool epilog_peeling = maybe_ne (bound_epilog, 0U); | |
2418 poly_uint64 bound_scalar = bound_epilog; | |
1669 | 2419 |
1670 if (!prolog_peeling && !epilog_peeling) | 2420 if (!prolog_peeling && !epilog_peeling) |
1671 return NULL; | 2421 return NULL; |
1672 | 2422 |
1673 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10); | 2423 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10); |
1674 if ((vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo)) == 2) | 2424 estimated_vf = vect_vf_for_cost (loop_vinfo); |
1675 vf = 3; | 2425 if (estimated_vf == 2) |
2426 estimated_vf = 3; | |
1676 prob_prolog = prob_epilog = profile_probability::guessed_always () | 2427 prob_prolog = prob_epilog = profile_probability::guessed_always () |
1677 .apply_scale (vf - 1, vf); | 2428 .apply_scale (estimated_vf - 1, estimated_vf); |
1678 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
1679 | 2429 |
1680 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo); | 2430 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1681 struct loop *first_loop = loop; | 2431 struct loop *first_loop = loop; |
1682 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP; | 2432 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP; |
1683 create_lcssa_for_virtual_phi (loop); | 2433 create_lcssa_for_virtual_phi (loop); |
1684 update_ssa (TODO_update_ssa_only_virtuals); | 2434 update_ssa (TODO_update_ssa_only_virtuals); |
1685 | 2435 |
1686 if (MAY_HAVE_DEBUG_STMTS) | 2436 if (MAY_HAVE_DEBUG_BIND_STMTS) |
1687 { | 2437 { |
1688 gcc_assert (!adjust_vec.exists ()); | 2438 gcc_assert (!adjust_vec.exists ()); |
1689 adjust_vec.create (32); | 2439 adjust_vec.create (32); |
1690 } | 2440 } |
1691 initialize_original_copy_tables (); | 2441 initialize_original_copy_tables (); |
2442 | |
2443 /* Record the anchor bb at which the guard should be placed if the scalar | |
2444 loop might be preferred. */ | |
2445 basic_block anchor = loop_preheader_edge (loop)->src; | |
2446 | |
2447 /* Generate the number of iterations for the prolog loop. We do this here | |
2448 so that we can also get the upper bound on the number of iterations. */ | |
2449 tree niters_prolog; | |
2450 int bound_prolog = 0; | |
2451 if (prolog_peeling) | |
2452 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor, | |
2453 &bound_prolog); | |
2454 else | |
2455 niters_prolog = build_int_cst (type, 0); | |
1692 | 2456 |
1693 /* Prolog loop may be skipped. */ | 2457 /* Prolog loop may be skipped. */ |
1694 bool skip_prolog = (prolog_peeling != 0); | 2458 bool skip_prolog = (prolog_peeling != 0); |
1695 /* Skip to epilog if scalar loop may be preferred. It's only needed | 2459 /* Skip to epilog if scalar loop may be preferred. It's only needed |
1696 when we peel for epilog loop and when it hasn't been checked with | 2460 when we peel for epilog loop and when it hasn't been checked with |
1697 loop versioning. */ | 2461 loop versioning. */ |
1698 bool skip_vector = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | 2462 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
1699 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)); | 2463 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo), |
2464 bound_prolog + bound_epilog) | |
2465 : !LOOP_REQUIRES_VERSIONING (loop_vinfo)); | |
1700 /* Epilog loop must be executed if the number of iterations for epilog | 2466 /* Epilog loop must be executed if the number of iterations for epilog |
1701 loop is known at compile time, otherwise we need to add a check at | 2467 loop is known at compile time, otherwise we need to add a check at |
1702 the end of vector loop and skip to the end of epilog loop. */ | 2468 the end of vector loop and skip to the end of epilog loop. */ |
1703 bool skip_epilog = (prolog_peeling < 0 | 2469 bool skip_epilog = (prolog_peeling < 0 |
1704 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)); | 2470 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
2471 || !vf.is_constant ()); | |
1705 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */ | 2472 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */ |
1706 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) | 2473 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) |
1707 skip_epilog = false; | 2474 skip_epilog = false; |
1708 | 2475 |
1709 /* Record the anchor bb at which guard should be placed if scalar loop | |
1710 may be preferred. */ | |
1711 basic_block anchor = loop_preheader_edge (loop)->src; | |
1712 if (skip_vector) | 2476 if (skip_vector) |
1713 { | 2477 { |
1714 split_edge (loop_preheader_edge (loop)); | 2478 split_edge (loop_preheader_edge (loop)); |
1715 | 2479 |
1716 /* Due to the order in which we peel prolog and epilog, we first | 2480 /* Due to the order in which we peel prolog and epilog, we first |
1718 avoid adjusting probabilities of both prolog and vector loops | 2482 avoid adjusting probabilities of both prolog and vector loops |
1719 separately. Note in this case, the probability of epilog loop | 2483 separately. Note in this case, the probability of epilog loop |
1720 needs to be scaled back later. */ | 2484 needs to be scaled back later. */ |
1721 basic_block bb_before_loop = loop_preheader_edge (loop)->src; | 2485 basic_block bb_before_loop = loop_preheader_edge (loop)->src; |
1722 if (prob_vector.initialized_p ()) | 2486 if (prob_vector.initialized_p ()) |
1723 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector); | 2487 { |
1724 scale_loop_profile (loop, prob_vector, bound); | 2488 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector); |
1725 } | 2489 scale_loop_profile (loop, prob_vector, 0); |
1726 | 2490 } |
1727 tree niters_prolog = build_int_cst (type, 0); | 2491 } |
1728 source_location loop_loc = find_loop_location (loop); | 2492 |
2493 dump_user_location_t loop_loc = find_loop_location (loop); | |
1729 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); | 2494 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); |
1730 if (prolog_peeling) | 2495 if (prolog_peeling) |
1731 { | 2496 { |
1732 e = loop_preheader_edge (loop); | 2497 e = loop_preheader_edge (loop); |
1733 if (!slpeel_can_duplicate_loop_p (loop, e)) | 2498 if (!slpeel_can_duplicate_loop_p (loop, e)) |
1746 } | 2511 } |
1747 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true); | 2512 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true); |
1748 first_loop = prolog; | 2513 first_loop = prolog; |
1749 reset_original_copy_tables (); | 2514 reset_original_copy_tables (); |
1750 | 2515 |
1751 /* Generate and update the number of iterations for prolog loop. */ | 2516 /* Update the number of iterations for prolog loop. */ |
1752 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor, | 2517 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog)); |
1753 &bound_prolog); | 2518 vect_set_loop_condition (prolog, NULL, niters_prolog, |
1754 slpeel_make_loop_iterate_ntimes (prolog, niters_prolog); | 2519 step_prolog, NULL_TREE, false); |
1755 | 2520 |
1756 /* Skip the prolog loop. */ | 2521 /* Skip the prolog loop. */ |
1757 if (skip_prolog) | 2522 if (skip_prolog) |
1758 { | 2523 { |
1759 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, | 2524 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, |
1771 | 2536 |
1772 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog); | 2537 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog); |
1773 scale_loop_profile (prolog, prob_prolog, bound_prolog); | 2538 scale_loop_profile (prolog, prob_prolog, bound_prolog); |
1774 } | 2539 } |
1775 /* Update init address of DRs. */ | 2540 /* Update init address of DRs. */ |
1776 vect_update_inits_of_drs (loop_vinfo, niters_prolog); | 2541 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR); |
1777 /* Update niters for vector loop. */ | 2542 /* Update niters for vector loop. */ |
1778 LOOP_VINFO_NITERS (loop_vinfo) | 2543 LOOP_VINFO_NITERS (loop_vinfo) |
1779 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog); | 2544 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog); |
1780 LOOP_VINFO_NITERSM1 (loop_vinfo) | 2545 LOOP_VINFO_NITERSM1 (loop_vinfo) |
1781 = fold_build2 (MINUS_EXPR, type, | 2546 = fold_build2 (MINUS_EXPR, type, |
1821 iterations of loop is unknown at compile time, otherwise this | 2586 iterations of loop is unknown at compile time, otherwise this |
1822 won't be vectorized. */ | 2587 won't be vectorized. */ |
1823 if (skip_vector) | 2588 if (skip_vector) |
1824 { | 2589 { |
1825 /* Additional epilogue iteration is peeled if gap exists. */ | 2590 /* Additional epilogue iteration is peeled if gap exists. */ |
1826 bool peel_for_gaps = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); | |
1827 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling, | 2591 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling, |
1828 bound_prolog, | 2592 bound_prolog, bound_epilog, |
1829 peel_for_gaps ? vf : vf - 1, | |
1830 th, &bound_scalar, | 2593 th, &bound_scalar, |
1831 check_profitability); | 2594 check_profitability); |
1832 /* Build guard against NITERSM1 since NITERS may overflow. */ | 2595 /* Build guard against NITERSM1 since NITERS may overflow. */ |
1833 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t); | 2596 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t); |
1834 guard_bb = anchor; | 2597 guard_bb = anchor; |
1841 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); | 2604 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); |
1842 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e); | 2605 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e); |
1843 | 2606 |
1844 /* Simply propagate profile info from guard_bb to guard_to which is | 2607 /* Simply propagate profile info from guard_bb to guard_to which is |
1845 a merge point of control flow. */ | 2608 a merge point of control flow. */ |
1846 guard_to->frequency = guard_bb->frequency; | |
1847 guard_to->count = guard_bb->count; | 2609 guard_to->count = guard_bb->count; |
2610 | |
1848 /* Scale probability of epilog loop back. | 2611 /* Scale probability of epilog loop back. |
1849 FIXME: We should avoid scaling down and back up. Profile may | 2612 FIXME: We should avoid scaling down and back up. Profile may |
1850 get lost if we scale down to 0. */ | 2613 get lost if we scale down to 0. */ |
1851 int scale_up = REG_BR_PROB_BASE * REG_BR_PROB_BASE | |
1852 / prob_vector.to_reg_br_prob_base (); | |
1853 basic_block *bbs = get_loop_body (epilog); | 2614 basic_block *bbs = get_loop_body (epilog); |
1854 scale_bbs_frequencies_int (bbs, epilog->num_nodes, scale_up, | 2615 for (unsigned int i = 0; i < epilog->num_nodes; i++) |
1855 REG_BR_PROB_BASE); | 2616 bbs[i]->count = bbs[i]->count.apply_scale |
2617 (bbs[i]->count, | |
2618 bbs[i]->count.apply_probability | |
2619 (prob_vector)); | |
1856 free (bbs); | 2620 free (bbs); |
1857 } | 2621 } |
1858 | 2622 |
1859 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src; | 2623 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src; |
1860 tree niters_vector_mult_vf; | 2624 tree niters_vector_mult_vf; |
1861 /* If loop is peeled for non-zero constant times, now niters refers to | 2625 /* If loop is peeled for non-zero constant times, now niters refers to |
1862 orig_niters - prolog_peeling, it won't overflow even the orig_niters | 2626 orig_niters - prolog_peeling, it won't overflow even the orig_niters |
1863 overflows. */ | 2627 overflows. */ |
1864 niters_no_overflow |= (prolog_peeling > 0); | 2628 niters_no_overflow |= (prolog_peeling > 0); |
1865 vect_gen_vector_loop_niters (loop_vinfo, niters, | 2629 vect_gen_vector_loop_niters (loop_vinfo, niters, |
1866 niters_vector, niters_no_overflow); | 2630 niters_vector, step_vector, |
1867 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector, | 2631 niters_no_overflow); |
1868 &niters_vector_mult_vf); | 2632 if (!integer_onep (*step_vector)) |
2633 { | |
2634 /* On exit from the loop we will have an easy way of calcalating | |
2635 NITERS_VECTOR / STEP * STEP. Install a dummy definition | |
2636 until then. */ | |
2637 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector)); | |
2638 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop (); | |
2639 *niters_vector_mult_vf_var = niters_vector_mult_vf; | |
2640 } | |
2641 else | |
2642 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector, | |
2643 &niters_vector_mult_vf); | |
1869 /* Update IVs of original loop as if they were advanced by | 2644 /* Update IVs of original loop as if they were advanced by |
1870 niters_vector_mult_vf steps. */ | 2645 niters_vector_mult_vf steps. */ |
1871 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo)); | 2646 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo)); |
1872 edge update_e = skip_vector ? e : loop_preheader_edge (epilog); | 2647 edge update_e = skip_vector ? e : loop_preheader_edge (epilog); |
1873 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf, | 2648 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf, |
1891 { | 2666 { |
1892 prob_epilog = prob_vector * prob_epilog + prob_vector.invert (); | 2667 prob_epilog = prob_vector * prob_epilog + prob_vector.invert (); |
1893 | 2668 |
1894 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog); | 2669 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog); |
1895 } | 2670 } |
1896 scale_loop_profile (epilog, prob_epilog, bound); | 2671 scale_loop_profile (epilog, prob_epilog, 0); |
1897 } | 2672 } |
1898 else | 2673 else |
1899 slpeel_update_phi_nodes_for_lcssa (epilog); | 2674 slpeel_update_phi_nodes_for_lcssa (epilog); |
1900 | 2675 |
1901 bound = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? vf - 1 : vf - 2; | 2676 unsigned HOST_WIDE_INT bound; |
1902 /* We share epilog loop with scalar version loop. */ | 2677 if (bound_scalar.is_constant (&bound)) |
1903 bound = MAX (bound, bound_scalar - 1); | 2678 { |
1904 record_niter_bound (epilog, bound, false, true); | 2679 gcc_assert (bound != 0); |
2680 /* -1 to convert loop iterations to latch iterations. */ | |
2681 record_niter_bound (epilog, bound - 1, false, true); | |
2682 } | |
1905 | 2683 |
1906 delete_update_ssa (); | 2684 delete_update_ssa (); |
1907 adjust_vec_debug_stmts (); | 2685 adjust_vec_debug_stmts (); |
1908 scev_reset (); | 2686 scev_reset (); |
1909 } | 2687 } |
1914 } | 2692 } |
1915 | 2693 |
1916 /* Function vect_create_cond_for_niters_checks. | 2694 /* Function vect_create_cond_for_niters_checks. |
1917 | 2695 |
1918 Create a conditional expression that represents the run-time checks for | 2696 Create a conditional expression that represents the run-time checks for |
1919 loop's niter. The loop is guaranteed to to terminate if the run-time | 2697 loop's niter. The loop is guaranteed to terminate if the run-time |
1920 checks hold. | 2698 checks hold. |
1921 | 2699 |
1922 Input: | 2700 Input: |
1923 COND_EXPR - input conditional expression. New conditions will be chained | 2701 COND_EXPR - input conditional expression. New conditions will be chained |
1924 with logical AND operation. If it is NULL, then the function | 2702 with logical AND operation. If it is NULL, then the function |
1986 static void | 2764 static void |
1987 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, | 2765 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, |
1988 tree *cond_expr, | 2766 tree *cond_expr, |
1989 gimple_seq *cond_expr_stmt_list) | 2767 gimple_seq *cond_expr_stmt_list) |
1990 { | 2768 { |
1991 vec<gimple *> may_misalign_stmts | 2769 vec<stmt_vec_info> may_misalign_stmts |
1992 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); | 2770 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); |
1993 gimple *ref_stmt; | 2771 stmt_vec_info stmt_info; |
1994 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo); | 2772 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo); |
1995 tree mask_cst; | 2773 tree mask_cst; |
1996 unsigned int i; | 2774 unsigned int i; |
1997 tree int_ptrsize_type; | 2775 tree int_ptrsize_type; |
1998 char tmp_name[20]; | 2776 char tmp_name[20]; |
2009 int_ptrsize_type = signed_type_for (ptr_type_node); | 2787 int_ptrsize_type = signed_type_for (ptr_type_node); |
2010 | 2788 |
2011 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address | 2789 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address |
2012 of the first vector of the i'th data reference. */ | 2790 of the first vector of the i'th data reference. */ |
2013 | 2791 |
2014 FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt) | 2792 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info) |
2015 { | 2793 { |
2016 gimple_seq new_stmt_list = NULL; | 2794 gimple_seq new_stmt_list = NULL; |
2017 tree addr_base; | 2795 tree addr_base; |
2018 tree addr_tmp_name; | 2796 tree addr_tmp_name; |
2019 tree new_or_tmp_name; | 2797 tree new_or_tmp_name; |
2020 gimple *addr_stmt, *or_stmt; | 2798 gimple *addr_stmt, *or_stmt; |
2021 stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt); | 2799 tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
2022 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); | |
2023 bool negative = tree_int_cst_compare | 2800 bool negative = tree_int_cst_compare |
2024 (DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0; | 2801 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0; |
2025 tree offset = negative | 2802 tree offset = negative |
2026 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node; | 2803 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node; |
2027 | 2804 |
2028 /* create: addr_tmp = (int)(address_of_first_vector) */ | 2805 /* create: addr_tmp = (int)(address_of_first_vector) */ |
2029 addr_base = | 2806 addr_base = |
2030 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list, | 2807 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list, |
2031 offset); | 2808 offset); |
2032 if (new_stmt_list != NULL) | 2809 if (new_stmt_list != NULL) |
2033 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list); | 2810 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list); |
2034 | 2811 |
2035 sprintf (tmp_name, "addr2int%d", i); | 2812 sprintf (tmp_name, "addr2int%d", i); |
2090 addr1, addr2); | 2867 addr1, addr2); |
2091 chain_cond_expr (cond_expr, part_cond_expr); | 2868 chain_cond_expr (cond_expr, part_cond_expr); |
2092 } | 2869 } |
2093 } | 2870 } |
2094 | 2871 |
2872 /* Create an expression that is true when all lower-bound conditions for | |
2873 the vectorized loop are met. Chain this condition with *COND_EXPR. */ | |
2874 | |
2875 static void | |
2876 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr) | |
2877 { | |
2878 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); | |
2879 for (unsigned int i = 0; i < lower_bounds.length (); ++i) | |
2880 { | |
2881 tree expr = lower_bounds[i].expr; | |
2882 tree type = unsigned_type_for (TREE_TYPE (expr)); | |
2883 expr = fold_convert (type, expr); | |
2884 poly_uint64 bound = lower_bounds[i].min_value; | |
2885 if (!lower_bounds[i].unsigned_p) | |
2886 { | |
2887 expr = fold_build2 (PLUS_EXPR, type, expr, | |
2888 build_int_cstu (type, bound - 1)); | |
2889 bound += bound - 1; | |
2890 } | |
2891 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr, | |
2892 build_int_cstu (type, bound)); | |
2893 chain_cond_expr (cond_expr, part_cond_expr); | |
2894 } | |
2895 } | |
2896 | |
2095 /* Function vect_create_cond_for_alias_checks. | 2897 /* Function vect_create_cond_for_alias_checks. |
2096 | 2898 |
2097 Create a conditional expression that represents the run-time checks for | 2899 Create a conditional expression that represents the run-time checks for |
2098 overlapping of address ranges represented by a list of data references | 2900 overlapping of address ranges represented by a list of data references |
2099 relations passed as input. | 2901 relations passed as input. |
2149 The versioning precondition(s) are placed in *COND_EXPR and | 2951 The versioning precondition(s) are placed in *COND_EXPR and |
2150 *COND_EXPR_STMT_LIST. */ | 2952 *COND_EXPR_STMT_LIST. */ |
2151 | 2953 |
2152 void | 2954 void |
2153 vect_loop_versioning (loop_vec_info loop_vinfo, | 2955 vect_loop_versioning (loop_vec_info loop_vinfo, |
2154 unsigned int th, bool check_profitability) | 2956 unsigned int th, bool check_profitability, |
2957 poly_uint64 versioning_threshold) | |
2155 { | 2958 { |
2156 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop; | 2959 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop; |
2157 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); | 2960 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); |
2158 basic_block condition_bb; | 2961 basic_block condition_bb; |
2159 gphi_iterator gsi; | 2962 gphi_iterator gsi; |
2174 | 2977 |
2175 if (check_profitability) | 2978 if (check_profitability) |
2176 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, | 2979 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, |
2177 build_int_cst (TREE_TYPE (scalar_loop_iters), | 2980 build_int_cst (TREE_TYPE (scalar_loop_iters), |
2178 th - 1)); | 2981 th - 1)); |
2982 if (maybe_ne (versioning_threshold, 0U)) | |
2983 { | |
2984 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, | |
2985 build_int_cst (TREE_TYPE (scalar_loop_iters), | |
2986 versioning_threshold - 1)); | |
2987 if (cond_expr) | |
2988 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node, | |
2989 expr, cond_expr); | |
2990 else | |
2991 cond_expr = expr; | |
2992 } | |
2179 | 2993 |
2180 if (version_niter) | 2994 if (version_niter) |
2181 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr); | 2995 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr); |
2182 | 2996 |
2183 if (cond_expr) | 2997 if (cond_expr) |
2189 &cond_expr_stmt_list); | 3003 &cond_expr_stmt_list); |
2190 | 3004 |
2191 if (version_alias) | 3005 if (version_alias) |
2192 { | 3006 { |
2193 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr); | 3007 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr); |
3008 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr); | |
2194 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr); | 3009 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr); |
2195 } | 3010 } |
2196 | 3011 |
2197 cond_expr = force_gimple_operand_1 (cond_expr, &gimplify_stmt_list, | 3012 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), |
3013 &gimplify_stmt_list, | |
2198 is_gimple_condexpr, NULL_TREE); | 3014 is_gimple_condexpr, NULL_TREE); |
2199 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); | 3015 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); |
2200 | 3016 |
2201 initialize_original_copy_tables (); | 3017 initialize_original_copy_tables (); |
2202 if (scalar_loop) | 3018 if (scalar_loop) |
2211 scale_loop_frequencies (loop, prob); | 3027 scale_loop_frequencies (loop, prob); |
2212 /* CONDITION_BB was created above SCALAR_LOOP's preheader, | 3028 /* CONDITION_BB was created above SCALAR_LOOP's preheader, |
2213 while we need to move it above LOOP's preheader. */ | 3029 while we need to move it above LOOP's preheader. */ |
2214 e = loop_preheader_edge (loop); | 3030 e = loop_preheader_edge (loop); |
2215 scalar_e = loop_preheader_edge (scalar_loop); | 3031 scalar_e = loop_preheader_edge (scalar_loop); |
2216 gcc_assert (empty_block_p (e->src) | 3032 /* The vector loop preheader might not be empty, since new |
2217 && single_pred_p (e->src)); | 3033 invariants could have been created while analyzing the loop. */ |
3034 gcc_assert (single_pred_p (e->src)); | |
2218 gcc_assert (empty_block_p (scalar_e->src) | 3035 gcc_assert (empty_block_p (scalar_e->src) |
2219 && single_pred_p (scalar_e->src)); | 3036 && single_pred_p (scalar_e->src)); |
2220 gcc_assert (single_pred_p (condition_bb)); | 3037 gcc_assert (single_pred_p (condition_bb)); |
2221 preheader = e->src; | 3038 preheader = e->src; |
2222 scalar_preheader = scalar_e->src; | 3039 scalar_preheader = scalar_e->src; |
2245 vect_free_loop_info_assumptions (nloop); | 3062 vect_free_loop_info_assumptions (nloop); |
2246 /* And set constraint LOOP_C_INFINITE for niter analyzer. */ | 3063 /* And set constraint LOOP_C_INFINITE for niter analyzer. */ |
2247 loop_constraint_set (loop, LOOP_C_INFINITE); | 3064 loop_constraint_set (loop, LOOP_C_INFINITE); |
2248 } | 3065 } |
2249 | 3066 |
2250 if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION | 3067 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION |
2251 && dump_enabled_p ()) | 3068 && dump_enabled_p ()) |
2252 { | 3069 { |
2253 if (version_alias) | 3070 if (version_alias) |
2254 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, | 3071 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, |
3072 vect_location, | |
2255 "loop versioned for vectorization because of " | 3073 "loop versioned for vectorization because of " |
2256 "possible aliasing\n"); | 3074 "possible aliasing\n"); |
2257 if (version_align) | 3075 if (version_align) |
2258 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, | 3076 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, |
3077 vect_location, | |
2259 "loop versioned for vectorization to enhance " | 3078 "loop versioned for vectorization to enhance " |
2260 "alignment\n"); | 3079 "alignment\n"); |
2261 | 3080 |
2262 } | 3081 } |
2263 free_original_copy_tables (); | 3082 free_original_copy_tables (); |