comparison gcc/tree-ssa-loop-prefetch.c @ 67:f6334be47118

update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
author nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
date Tue, 22 Mar 2011 17:18:12 +0900
parents b7f97abdc517
children 04ced10e8804
65:65488c3d617d 67:f6334be47118
23 #include "tm.h" 23 #include "tm.h"
24 #include "tree.h" 24 #include "tree.h"
25 #include "tm_p.h" 25 #include "tm_p.h"
26 #include "basic-block.h" 26 #include "basic-block.h"
27 #include "output.h" 27 #include "output.h"
28 #include "diagnostic.h"
29 #include "tree-pretty-print.h" 28 #include "tree-pretty-print.h"
30 #include "tree-flow.h" 29 #include "tree-flow.h"
31 #include "tree-dump.h" 30 #include "tree-dump.h"
32 #include "timevar.h" 31 #include "timevar.h"
33 #include "cfgloop.h" 32 #include "cfgloop.h"
34 #include "expr.h"
35 #include "tree-pass.h" 33 #include "tree-pass.h"
36 #include "insn-config.h" 34 #include "insn-config.h"
37 #include "recog.h" 35 #include "recog.h"
38 #include "hashtab.h" 36 #include "hashtab.h"
39 #include "tree-chrec.h" 37 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h" 38 #include "tree-scalar-evolution.h"
41 #include "toplev.h" 39 #include "diagnostic-core.h"
42 #include "params.h" 40 #include "params.h"
43 #include "langhooks.h" 41 #include "langhooks.h"
44 #include "tree-inline.h" 42 #include "tree-inline.h"
45 #include "tree-data-ref.h" 43 #include "tree-data-ref.h"
44
45
46 /* FIXME: Needed for optabs, but this should all be moved to a TBD interface
47 between the GIMPLE and RTL worlds. */
48 #include "expr.h"
46 #include "optabs.h" 49 #include "optabs.h"
47 50
48 /* This pass inserts prefetch instructions to optimize cache usage during 51 /* This pass inserts prefetch instructions to optimize cache usage during
49 accesses to arrays in loops. It processes loops sequentially and: 52 accesses to arrays in loops. It processes loops sequentially and:
50 53
74 location 64 iterations before it, and PREFETCH_MOD 64 (since 77 location 64 iterations before it, and PREFETCH_MOD 64 (since
75 it hits the same cache line otherwise). 78 it hits the same cache line otherwise).
76 (2) has PREFETCH_MOD 64 79 (2) has PREFETCH_MOD 64
77 (3) has PREFETCH_MOD 4 80 (3) has PREFETCH_MOD 4
78 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since 81 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
79 the cache line accessed by (4) is the same with probability only 82 the cache line accessed by (5) is the same with probability only
80 7/32. 83 7/32.
81 (5) has PREFETCH_MOD 1 as well. 84 (5) has PREFETCH_MOD 1 as well.
82 85
83 Additionally, we use data dependence analysis to determine for each 86 Additionally, we use data dependence analysis to determine for each
84 reference the distance till the first reuse; this information is used 87 reference the distance till the first reuse; this information is used
104 107
105 6) We actually emit the prefetch instructions. ??? Perhaps emit the 108 6) We actually emit the prefetch instructions. ??? Perhaps emit the
106 prefetch instructions with guards in cases where 5) was not sufficient 109 prefetch instructions with guards in cases where 5) was not sufficient
107 to satisfy the constraints? 110 to satisfy the constraints?
108 111
109 The function is_loop_prefetching_profitable() implements a cost model 112 A cost model is implemented to determine whether or not prefetching is
110 to determine if prefetching is profitable for a given loop. The cost 113 profitable for a given loop. The cost model has three heuristics:
111 model has two heuristcs: 114
112 1. A heuristic that determines whether the given loop has enough CPU 115 1. Function trip_count_to_ahead_ratio_too_small_p implements a
113 ops that can be overlapped with cache missing memory ops. 116 heuristic that determines whether or not the loop has too few
114 If not, the loop won't benefit from prefetching. This is implemented 117 iterations (compared to ahead). Prefetching is not likely to be
115 by requirung the ratio between the instruction count and the mem ref 118 beneficial if the trip count to ahead ratio is below a certain
116 count to be above a certain minimum. 119 minimum.
117 2. A heuristic that disables prefetching in a loop with an unknown trip 120
118 count if the prefetching cost is above a certain limit. The relative 121 2. Function mem_ref_count_reasonable_p implements a heuristic that
119 prefetching cost is estimated by taking the ratio between the 122 determines whether the given loop has enough CPU ops that can be
120 prefetch count and the total intruction count (this models the I-cache 123 overlapped with cache missing memory ops. If not, the loop
121 cost). 124 won't benefit from prefetching. In the implementation,
125 prefetching is not considered beneficial if the ratio between
126 the instruction count and the mem ref count is below a certain
127 minimum.
128
129 3. Function insn_to_prefetch_ratio_too_small_p implements a
130 heuristic that disables prefetching in a loop if the prefetching
131 cost is above a certain limit. The relative prefetching cost is
132 estimated by taking the ratio between the prefetch count and the
133 total instruction count (this models the I-cache cost).
134
122 The limits used in these heuristics are defined as parameters with 135 The limits used in these heuristics are defined as parameters with
123 reasonable default values. Machine-specific default values will be 136 reasonable default values. Machine-specific default values will be
124 added later. 137 added later.
125 138
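Taken together, the three helpers gate the transformation the way the reworked loop_prefetch_arrays (further down in this change) calls them; condensed into one sketch:

    if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
      return false;   /* 1: the loop rolls too few times relative to AHEAD.  */
    if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
      goto fail;      /* 2: not enough CPU work per memory reference, or too
                            many references to analyze cheaply.  */
    if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count, unroll_factor))
      goto fail;      /* 3: prefetch instructions would crowd out real work.  */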
126 Some other TODO: 139 Some other TODO:
223 236
224 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0) 237 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
225 238
226 /* Do not generate a prefetch if the unroll factor is significantly less 239 /* Do not generate a prefetch if the unroll factor is significantly less
227 than what is required by the prefetch. This is to avoid redundant 240 than what is required by the prefetch. This is to avoid redundant
228 prefetches. For example, if prefetch_mod is 16 and unroll_factor is 241 prefetches. For example, when prefetch_mod is 16 and unroll_factor is
229 1, this means prefetching requires unrolling the loop 16 times, but 242 2, prefetching requires unrolling the loop 16 times, but
230 the loop is not going to be unrolled. In this case (ratio = 16), 243 the loop is actually unrolled twice. In this case (ratio = 8),
231 prefetching is not likely to be beneficial. */ 244 prefetching is not likely to be beneficial. */
232 245
233 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 246 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
234 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 8 247 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
248 #endif
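The ratio guards the per-reference scheduling decision; a minimal sketch of the check this default feeds (the call site lies outside the hunks shown here):

    if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
      continue;   /* e.g. prefetch_mod 16 with unroll_factor 2: ratio 8 > 4, skip.  */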
249
250 /* Some of the prefetch computations have quadratic complexity. We want to
251 avoid huge compile times and, therefore, want to limit the amount of
252 memory references per loop where we consider prefetching. */
253
254 #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
255 #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
235 #endif 256 #endif
236 257
237 /* The memory reference. */ 258 /* The memory reference. */
238 259
239 struct mem_ref 260 struct mem_ref
399 struct ar_data *ar_data = (struct ar_data *) data; 420 struct ar_data *ar_data = (struct ar_data *) data;
400 tree ibase, step, stepsize; 421 tree ibase, step, stepsize;
401 HOST_WIDE_INT idelta = 0, imult = 1; 422 HOST_WIDE_INT idelta = 0, imult = 1;
402 affine_iv iv; 423 affine_iv iv;
403 424
404 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
405 || TREE_CODE (base) == ALIGN_INDIRECT_REF)
406 return false;
407
408 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt), 425 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
409 *index, &iv, true)) 426 *index, &iv, true))
410 return false; 427 return false;
411 ibase = iv.base; 428 ibase = iv.base;
412 step = iv.step; 429 step = iv.step;
419 } 436 }
420 if (cst_and_fits_in_hwi (ibase)) 437 if (cst_and_fits_in_hwi (ibase))
421 { 438 {
422 idelta += int_cst_value (ibase); 439 idelta += int_cst_value (ibase);
423 ibase = build_int_cst (TREE_TYPE (ibase), 0); 440 ibase = build_int_cst (TREE_TYPE (ibase), 0);
441 }
442
443 if (TREE_CODE (base) == ARRAY_REF)
444 {
445 stepsize = array_ref_element_size (base);
446 if (!cst_and_fits_in_hwi (stepsize))
447 return false;
448 imult = int_cst_value (stepsize);
449 step = fold_build2 (MULT_EXPR, sizetype,
450 fold_convert (sizetype, step),
451 fold_convert (sizetype, stepsize));
452 idelta *= imult;
424 } 453 }
425 454
426 if (*ar_data->step == NULL_TREE) 455 if (*ar_data->step == NULL_TREE)
427 *ar_data->step = step; 456 *ar_data->step = step;
428 else 457 else
429 *ar_data->step = fold_build2 (PLUS_EXPR, sizetype, 458 *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
430 fold_convert (sizetype, *ar_data->step), 459 fold_convert (sizetype, *ar_data->step),
431 fold_convert (sizetype, step)); 460 fold_convert (sizetype, step));
432 if (TREE_CODE (base) == ARRAY_REF)
433 {
434 stepsize = array_ref_element_size (base);
435 if (!cst_and_fits_in_hwi (stepsize))
436 return false;
437 imult = int_cst_value (stepsize);
438
439 *ar_data->step = fold_build2 (MULT_EXPR, sizetype,
440 fold_convert (sizetype, *ar_data->step),
441 fold_convert (sizetype, step));
442 idelta *= imult;
443 }
444
445 *ar_data->delta += idelta; 461 *ar_data->delta += idelta;
446 *index = ibase; 462 *index = ibase;
447 463
448 return true; 464 return true;
449 } 465 }
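A worked example of what this analysis records (illustrative, assuming a 4-byte int element type): for the store a[i + 7] in

    int a[N];
    for (i = 0; i < n; i++)
      a[i + 7] = 0;

simple_iv returns base 7 and step 1 for the index (with i starting at 0); the constant base is folded into idelta, and the ARRAY_REF branch above scales both by the element size, so the reference is recorded as &a[0] + 4 * iter + 28, i.e. step 4 and delta 28.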
464 tree ref = *ref_p; 480 tree ref = *ref_p;
465 481
466 *step = NULL_TREE; 482 *step = NULL_TREE;
467 *delta = 0; 483 *delta = 0;
468 484
469 /* First strip off the component references. Ignore bitfields. */ 485 /* First strip off the component references. Ignore bitfields.
470 if (TREE_CODE (ref) == COMPONENT_REF 486 Also strip off the real and imaginary parts of a complex, so that
471 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))) 487 they can have the same base. */
472 ref = TREE_OPERAND (ref, 0); 488 if (TREE_CODE (ref) == REALPART_EXPR
489 || TREE_CODE (ref) == IMAGPART_EXPR
490 || (TREE_CODE (ref) == COMPONENT_REF
491 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
492 {
493 if (TREE_CODE (ref) == IMAGPART_EXPR)
494 *delta += int_size_in_bytes (TREE_TYPE (ref));
495 ref = TREE_OPERAND (ref, 0);
496 }
473 497
474 *ref_p = ref; 498 *ref_p = ref;
475 499
476 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0)) 500 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
477 { 501 {
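The new REALPART_EXPR/IMAGPART_EXPR handling lets both halves of a complex access land in one group. For instance (illustrative, _Complex double assumed):

    _Complex double c[N];
    for (i = 0; i < n; i++)
      s += __real__ c[i] + __imag__ c[i];

For __imag__ c[i], TREE_TYPE (ref) is double, so *delta is advanced by 8 bytes; the real and imaginary accesses are then analyzed with the same base and deltas 0 and 8, and the reuse pruning can treat them as hitting the same cache line.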
507 531
508 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt)) 532 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
509 return false; 533 return false;
510 /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */ 534 /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
511 if (step == NULL_TREE) 535 if (step == NULL_TREE)
536 return false;
537
538 /* Stop if the address of BASE could not be taken. */
539 if (may_be_nonaddressable_p (base))
540 return false;
541
542 /* Limit non-constant step prefetching only to the innermost loops. */
543 if (!cst_and_fits_in_hwi (step) && loop->inner != NULL)
512 return false; 544 return false;
513 545
514 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP 546 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
515 are integer constants. */ 547 are integer constants. */
516 agrp = find_or_create_group (refs, base, step); 548 agrp = find_or_create_group (refs, base, step);
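Two of the new guards deserve an example. may_be_nonaddressable_p rejects bases whose address could not be taken for the later __builtin_prefetch argument, and the loop->inner test restricts non-constant strides to innermost loops, e.g. (illustrative):

    for (i = 0; i < n; i++)        /* loop->inner != NULL: has a nested loop */
      {
        x += a[i * stride];        /* step = stride * sizeof (*a), not a constant */
        for (j = 0; j < m; j++)
          ...
      }

Here the reference to a is no longer considered for prefetching, while the same access in a loop without an inner loop still is.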
632 } 664 }
633 665
634 /* Given a CACHE_LINE_SIZE and two inductive memory references 666 /* Given a CACHE_LINE_SIZE and two inductive memory references
635 with a common STEP greater than CACHE_LINE_SIZE and an address 667 with a common STEP greater than CACHE_LINE_SIZE and an address
636 difference DELTA, compute the probability that they will fall 668 difference DELTA, compute the probability that they will fall
637 in different cache lines. DISTINCT_ITERS is the number of 669 in different cache lines. Return true if the computed miss rate
638 distinct iterations after which the pattern repeats itself. 670 is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the
671 number of distinct iterations after which the pattern repeats itself.
639 ALIGN_UNIT is the unit of alignment in bytes. */ 672 ALIGN_UNIT is the unit of alignment in bytes. */
640 673
641 static int 674 static bool
642 compute_miss_rate (unsigned HOST_WIDE_INT cache_line_size, 675 is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
643 HOST_WIDE_INT step, HOST_WIDE_INT delta, 676 HOST_WIDE_INT step, HOST_WIDE_INT delta,
644 unsigned HOST_WIDE_INT distinct_iters, 677 unsigned HOST_WIDE_INT distinct_iters,
645 int align_unit) 678 int align_unit)
646 { 679 {
647 unsigned align, iter; 680 unsigned align, iter;
648 int total_positions, miss_positions, miss_rate; 681 int total_positions, miss_positions, max_allowed_miss_positions;
649 int address1, address2, cache_line1, cache_line2; 682 int address1, address2, cache_line1, cache_line2;
650 683
651 total_positions = 0; 684 /* It always misses if delta is greater than or equal to the cache
685 line size. */
686 if (delta >= (HOST_WIDE_INT) cache_line_size)
687 return false;
688
652 miss_positions = 0; 689 miss_positions = 0;
690 total_positions = (cache_line_size / align_unit) * distinct_iters;
691 max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
653 692
654 /* Iterate through all possible alignments of the first 693 /* Iterate through all possible alignments of the first
655 memory reference within its cache line. */ 694 memory reference within its cache line. */
656 for (align = 0; align < cache_line_size; align += align_unit) 695 for (align = 0; align < cache_line_size; align += align_unit)
657 696
660 { 699 {
661 address1 = align + step * iter; 700 address1 = align + step * iter;
662 address2 = address1 + delta; 701 address2 = address1 + delta;
663 cache_line1 = address1 / cache_line_size; 702 cache_line1 = address1 / cache_line_size;
664 cache_line2 = address2 / cache_line_size; 703 cache_line2 = address2 / cache_line_size;
665 total_positions += 1;
666 if (cache_line1 != cache_line2) 704 if (cache_line1 != cache_line2)
667 miss_positions += 1; 705 {
706 miss_positions += 1;
707 if (miss_positions > max_allowed_miss_positions)
708 return false;
709 }
668 } 710 }
669 miss_rate = 1000 * miss_positions / total_positions; 711 return true;
670 return miss_rate;
671 } 712 }
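A worked example of the rewritten check (numbers chosen for illustration; ACCEPTABLE_MISS_RATE assumed at its default of 50, i.e. 5%): is_miss_rate_acceptable (64, 96, 32, 2, 4) computes total_positions = (64 / 4) * 2 = 32 and therefore allows at most (50 * 32) / 1000 = 1 miss position. At align 0, iter 1 the addresses 96 and 128 fall into different 64-byte lines, and at align 4, iter 1 so do 100 and 132; the second miss already exceeds the limit, so the function returns false without visiting the remaining alignments, whereas the old compute_miss_rate always walked all 32 positions before dividing.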
672 713
673 /* Prune the prefetch candidate REF using the reuse with BY. 714 /* Prune the prefetch candidate REF using the reuse with BY.
674 If BY_IS_BEFORE is true, BY is before REF in the loop. */ 715 If BY_IS_BEFORE is true, BY is before REF in the loop. */
675 716
681 bool backward; 722 bool backward;
682 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta; 723 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
683 HOST_WIDE_INT delta = delta_b - delta_r; 724 HOST_WIDE_INT delta = delta_b - delta_r;
684 HOST_WIDE_INT hit_from; 725 HOST_WIDE_INT hit_from;
685 unsigned HOST_WIDE_INT prefetch_before, prefetch_block; 726 unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
686 int miss_rate;
687 HOST_WIDE_INT reduced_step; 727 HOST_WIDE_INT reduced_step;
688 unsigned HOST_WIDE_INT reduced_prefetch_block; 728 unsigned HOST_WIDE_INT reduced_prefetch_block;
689 tree ref_type; 729 tree ref_type;
690 int align_unit; 730 int align_unit;
691 731
780 820
781 prefetch_before = delta / step; 821 prefetch_before = delta / step;
782 delta %= step; 822 delta %= step;
783 ref_type = TREE_TYPE (ref->mem); 823 ref_type = TREE_TYPE (ref->mem);
784 align_unit = TYPE_ALIGN (ref_type) / 8; 824 align_unit = TYPE_ALIGN (ref_type) / 8;
785 miss_rate = compute_miss_rate(prefetch_block, step, delta, 825 if (is_miss_rate_acceptable (prefetch_block, step, delta,
786 reduced_prefetch_block, align_unit); 826 reduced_prefetch_block, align_unit))
787 if (miss_rate <= ACCEPTABLE_MISS_RATE)
788 { 827 {
789 /* Do not reduce prefetch_before if we meet beyond cache size. */ 828 /* Do not reduce prefetch_before if we meet beyond cache size. */
790 if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK) 829 if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
791 prefetch_before = PREFETCH_ALL; 830 prefetch_before = PREFETCH_ALL;
792 if (prefetch_before < ref->prefetch_before) 831 if (prefetch_before < ref->prefetch_before)
796 } 835 }
797 836
798 /* Try also the following iteration. */ 837 /* Try also the following iteration. */
799 prefetch_before++; 838 prefetch_before++;
800 delta = step - delta; 839 delta = step - delta;
801 miss_rate = compute_miss_rate(prefetch_block, step, delta, 840 if (is_miss_rate_acceptable (prefetch_block, step, delta,
802 reduced_prefetch_block, align_unit); 841 reduced_prefetch_block, align_unit))
803 if (miss_rate <= ACCEPTABLE_MISS_RATE)
804 { 842 {
805 if (prefetch_before < ref->prefetch_before) 843 if (prefetch_before < ref->prefetch_before)
806 ref->prefetch_before = prefetch_before; 844 ref->prefetch_before = prefetch_before;
807 845
808 return; 846 return;
986 } 1024 }
987 1025
988 return any; 1026 return any;
989 } 1027 }
990 1028
991 /* Estimate the number of prefetches in the given GROUPS. */ 1029 /* Return TRUE if no prefetch is going to be generated in the given
992 1030 GROUPS. */
993 static int 1031
994 estimate_prefetch_count (struct mem_ref_group *groups) 1032 static bool
1033 nothing_to_prefetch_p (struct mem_ref_group *groups)
995 { 1034 {
996 struct mem_ref *ref; 1035 struct mem_ref *ref;
997 int prefetch_count = 0;
998 1036
999 for (; groups; groups = groups->next) 1037 for (; groups; groups = groups->next)
1000 for (ref = groups->refs; ref; ref = ref->next) 1038 for (ref = groups->refs; ref; ref = ref->next)
1001 if (should_issue_prefetch_p (ref)) 1039 if (should_issue_prefetch_p (ref))
1002 prefetch_count++; 1040 return false;
1041
1042 return true;
1043 }
1044
1045 /* Estimate the number of prefetches in the given GROUPS.
1046 UNROLL_FACTOR is the factor by which LOOP was unrolled. */
1047
1048 static int
1049 estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1050 {
1051 struct mem_ref *ref;
1052 unsigned n_prefetches;
1053 int prefetch_count = 0;
1054
1055 for (; groups; groups = groups->next)
1056 for (ref = groups->refs; ref; ref = ref->next)
1057 if (should_issue_prefetch_p (ref))
1058 {
1059 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1060 / ref->prefetch_mod);
1061 prefetch_count += n_prefetches;
1062 }
1003 1063
1004 return prefetch_count; 1064 return prefetch_count;
1005 } 1065 }
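For example, with unroll_factor 8 a reference with prefetch_mod 4 now contributes (8 + 4 - 1) / 4 = 2 prefetches and one with prefetch_mod 16 still contributes 1, whereas the old estimate_prefetch_count counted every prefetched reference exactly once; the insn-to-prefetch-ratio heuristic below therefore sees the prefetch cost of the unrolled loop body, matching how issue_prefetch_ref computes n_prefetches.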
1006 1066
1007 /* Issue prefetches for the reference REF into loop as decided before. 1067 /* Issue prefetches for the reference REF into loop as decided before.
1029 / ref->prefetch_mod); 1089 / ref->prefetch_mod);
1030 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node); 1090 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1031 addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base), 1091 addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1032 true, NULL, true, GSI_SAME_STMT); 1092 true, NULL, true, GSI_SAME_STMT);
1033 write_p = ref->write_p ? integer_one_node : integer_zero_node; 1093 write_p = ref->write_p ? integer_one_node : integer_zero_node;
1034 local = build_int_cst (integer_type_node, nontemporal ? 0 : 3); 1094 local = nontemporal ? integer_zero_node : integer_three_node;
1035 1095
1036 for (ap = 0; ap < n_prefetches; ap++) 1096 for (ap = 0; ap < n_prefetches; ap++)
1037 { 1097 {
1038 if (cst_and_fits_in_hwi (ref->group->step)) 1098 if (cst_and_fits_in_hwi (ref->group->step))
1039 { 1099 {
1100 /* Check that we have the storent instruction for the mode. */ 1160 /* Check that we have the storent instruction for the mode. */
1101 mode = TYPE_MODE (TREE_TYPE (ref->mem)); 1161 mode = TYPE_MODE (TREE_TYPE (ref->mem));
1102 if (mode == BLKmode) 1162 if (mode == BLKmode)
1103 return false; 1163 return false;
1104 1164
1105 code = optab_handler (storent_optab, mode)->insn_code; 1165 code = optab_handler (storent_optab, mode);
1106 return code != CODE_FOR_nothing; 1166 return code != CODE_FOR_nothing;
1107 } 1167 }
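The storent_optab test also picks up the optabs API change in this snapshot: optab_handler now returns the insn code itself rather than a pointer to a handler table entry, so the result of optab_handler (storent_optab, mode) is compared directly against CODE_FOR_nothing, with no ->insn_code dereference.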
1108 1168
1109 /* If REF is a nontemporal store, we mark the corresponding modify statement 1169 /* If REF is a nontemporal store, we mark the corresponding modify statement
1110 and return true. Otherwise, we return false. */ 1170 and return true. Otherwise, we return false. */
1134 edge exit; 1194 edge exit;
1135 gimple call; 1195 gimple call;
1136 gimple_stmt_iterator bsi; 1196 gimple_stmt_iterator bsi;
1137 unsigned i; 1197 unsigned i;
1138 1198
1139 for (i = 0; VEC_iterate (edge, exits, i, exit); i++) 1199 FOR_EACH_VEC_ELT (edge, exits, i, exit)
1140 { 1200 {
1141 call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0); 1201 call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1142 1202
1143 if (!single_pred_p (exit->dest) 1203 if (!single_pred_p (exit->dest)
1144 /* If possible, we prefer not to insert the fence on other paths 1204 /* If possible, we prefer not to insert the fence on other paths
1171 { 1231 {
1172 VEC (edge, heap) *exits = get_loop_exit_edges (loop); 1232 VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1173 unsigned i; 1233 unsigned i;
1174 edge exit; 1234 edge exit;
1175 1235
1176 for (i = 0; VEC_iterate (edge, exits, i, exit); i++) 1236 FOR_EACH_VEC_ELT (edge, exits, i, exit)
1177 if ((exit->flags & EDGE_ABNORMAL) 1237 if ((exit->flags & EDGE_ABNORMAL)
1178 && exit->dest == EXIT_BLOCK_PTR) 1238 && exit->dest == EXIT_BLOCK_PTR)
1179 ret = false; 1239 ret = false;
1180 1240
1181 VEC_free (edge, heap, exits); 1241 VEC_free (edge, heap, exits);
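The VEC iteration change here recurs through the rest of the file: explicit VEC_iterate loops are rewritten with the FOR_EACH_VEC_ELT macro introduced in this GCC snapshot; the two spellings iterate identically:

    for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
      ...

    FOR_EACH_VEC_ELT (edge, exits, i, exit)
      ...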
1390 the innermost loop in that the stride is less than cache size. */ 1450 the innermost loop in that the stride is less than cache size. */
1391 1451
1392 strides = XCNEWVEC (HOST_WIDE_INT, n); 1452 strides = XCNEWVEC (HOST_WIDE_INT, n);
1393 access_fns = DR_ACCESS_FNS (dr); 1453 access_fns = DR_ACCESS_FNS (dr);
1394 1454
1395 for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++) 1455 FOR_EACH_VEC_ELT (tree, access_fns, i, access_fn)
1396 { 1456 {
1397 /* Keep track of the reference corresponding to the subscript, so that we 1457 /* Keep track of the reference corresponding to the subscript, so that we
1398 know its stride. */ 1458 know its stride. */
1399 while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF) 1459 while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1400 ref = TREE_OPERAND (ref, 0); 1460 ref = TREE_OPERAND (ref, 0);
1500 are used just as a heuristics to estimate temporality of the 1560 are used just as a heuristics to estimate temporality of the
1501 references, hence we do not need to worry about correctness). */ 1561 references, hence we do not need to worry about correctness). */
1502 for (gr = refs; gr; gr = gr->next) 1562 for (gr = refs; gr; gr = gr->next)
1503 for (ref = gr->refs; ref; ref = ref->next) 1563 for (ref = gr->refs; ref; ref = ref->next)
1504 { 1564 {
1505 dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p); 1565 dr = create_data_ref (nest, loop_containing_stmt (ref->stmt),
1566 ref->mem, ref->stmt, !ref->write_p);
1506 1567
1507 if (dr) 1568 if (dr)
1508 { 1569 {
1509 ref->reuse_distance = volume; 1570 ref->reuse_distance = volume;
1510 dr->aux = ref; 1571 dr->aux = ref;
1512 } 1573 }
1513 else 1574 else
1514 no_other_refs = false; 1575 no_other_refs = false;
1515 } 1576 }
1516 1577
1517 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++) 1578 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1518 { 1579 {
1519 dist = self_reuse_distance (dr, loop_data_size, n, loop); 1580 dist = self_reuse_distance (dr, loop_data_size, n, loop);
1520 ref = (struct mem_ref *) dr->aux; 1581 ref = (struct mem_ref *) dr->aux;
1521 if (ref->reuse_distance > dist) 1582 if (ref->reuse_distance > dist)
1522 ref->reuse_distance = dist; 1583 ref->reuse_distance = dist;
1525 ref->independent_p = true; 1586 ref->independent_p = true;
1526 } 1587 }
1527 1588
1528 compute_all_dependences (datarefs, &dependences, vloops, true); 1589 compute_all_dependences (datarefs, &dependences, vloops, true);
1529 1590
1530 for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++) 1591 FOR_EACH_VEC_ELT (ddr_p, dependences, i, dep)
1531 { 1592 {
1532 if (DDR_ARE_DEPENDENT (dep) == chrec_known) 1593 if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1533 continue; 1594 continue;
1534 1595
1535 ref = (struct mem_ref *) DDR_A (dep)->aux; 1596 ref = (struct mem_ref *) DDR_A (dep)->aux;
1604 fprintf (dump_file, " ref %p distance %u\n", 1665 fprintf (dump_file, " ref %p distance %u\n",
1605 (void *) ref, ref->reuse_distance); 1666 (void *) ref, ref->reuse_distance);
1606 } 1667 }
1607 } 1668 }
1608 1669
1609 /* Do a cost-benefit analysis to determine if prefetching is profitable 1670 /* Determine whether or not the trip count to ahead ratio is too small based
1610 for the current loop given the following parameters: 1671 on profitability considerations.
1611 AHEAD: the iteration ahead distance, 1672 AHEAD: the iteration ahead distance,
1612 EST_NITER: the estimated trip count, 1673 EST_NITER: the estimated trip count. */
1674
1675 static bool
1676 trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1677 {
1678 /* Assume trip count to ahead ratio is big enough if the trip count could not
1679 be estimated at compile time. */
1680 if (est_niter < 0)
1681 return false;
1682
1683 if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1684 {
1685 if (dump_file && (dump_flags & TDF_DETAILS))
1686 fprintf (dump_file,
1687 "Not prefetching -- loop estimated to roll only %d times\n",
1688 (int) est_niter);
1689 return true;
1690 }
1691
1692 return false;
1693 }
1694
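To make the new cut-off concrete (defaults assumed: a prefetch latency of 200 and TRIP_COUNT_TO_AHEAD_RATIO of 4): a loop body costing time = 20 gives ahead = (200 + 20 - 1) / 20 = 10, so prefetching is now rejected up front for any estimated trip count below 40, while an unknown trip count (est_niter < 0) is deliberately let through.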
1695 /* Determine whether or not the number of memory references in the loop is
1696 reasonable based on profitability and compilation time considerations.
1613 NINSNS: estimated number of instructions in the loop, 1697 NINSNS: estimated number of instructions in the loop,
1614 PREFETCH_COUNT: an estimate of the number of prefetches
1615 MEM_REF_COUNT: total number of memory references in the loop. */ 1698 MEM_REF_COUNT: total number of memory references in the loop. */
1616 1699
1617 static bool 1700 static bool
1618 is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter, 1701 mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1619 unsigned ninsns, unsigned prefetch_count, 1702 {
1620 unsigned mem_ref_count, unsigned unroll_factor) 1703 int insn_to_mem_ratio;
1621 {
1622 int insn_to_mem_ratio, insn_to_prefetch_ratio;
1623 1704
1624 if (mem_ref_count == 0) 1705 if (mem_ref_count == 0)
1706 return false;
1707
1708 /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1709 (compute_all_dependences) have high costs based on quadratic complexity.
1710 To avoid huge compilation time, we give up prefetching if mem_ref_count
1711 is too large. */
1712 if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1625 return false; 1713 return false;
1626 1714
1627 /* Prefetching improves performance by overlapping cache missing 1715 /* Prefetching improves performance by overlapping cache missing
1628 memory accesses with CPU operations. If the loop does not have 1716 memory accesses with CPU operations. If the loop does not have
1629 enough CPU operations to overlap with memory operations, prefetching 1717 enough CPU operations to overlap with memory operations, prefetching
1641 fprintf (dump_file, 1729 fprintf (dump_file,
1642 "Not prefetching -- instruction to memory reference ratio (%d) too small\n", 1730 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1643 insn_to_mem_ratio); 1731 insn_to_mem_ratio);
1644 return false; 1732 return false;
1645 } 1733 }
1734
1735 return true;
1736 }
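The middle of this function, outside the hunks shown, carries over the first check from is_loop_prefetching_profitable: it divides ninsns by mem_ref_count and rejects the loop when the resulting insn_to_mem_ratio falls below the minimum-ratio parameter, emitting the "instruction to memory reference ratio ... too small" dump message kept above; a sketch, with the parameter macro name assumed:

    insn_to_mem_ratio = ninsns / mem_ref_count;
    if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
      ...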
1737
1738 /* Determine whether or not the instruction to prefetch ratio in the loop is
1739 too small based on profitability considerations.
1740 NINSNS: estimated number of instructions in the loop,
1741 PREFETCH_COUNT: an estimate of the number of prefetches,
1742 UNROLL_FACTOR: the factor to unroll the loop if prefetching. */
1743
1744 static bool
1745 insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1746 unsigned unroll_factor)
1747 {
1748 int insn_to_prefetch_ratio;
1646 1749
1647 /* Prefetching most likely causes performance degradation when the instruction 1750 /* Prefetching most likely causes performance degradation when the instruction
1648 to prefetch ratio is too small. Too many prefetch instructions in a loop 1751 to prefetch ratio is too small. Too many prefetch instructions in a loop
1649 may reduce the I-cache performance. 1752 may reduce the I-cache performance.
1650 (unroll_factor * ninsns) is used to estimate the number of instructions in 1753 (unroll_factor * ninsns) is used to estimate the number of instructions in
1661 { 1764 {
1662 if (dump_file && (dump_flags & TDF_DETAILS)) 1765 if (dump_file && (dump_flags & TDF_DETAILS))
1663 fprintf (dump_file, 1766 fprintf (dump_file,
1664 "Not prefetching -- instruction to prefetch ratio (%d) too small\n", 1767 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1665 insn_to_prefetch_ratio); 1768 insn_to_prefetch_ratio);
1666 return false; 1769 return true;
1667 } 1770 }
1668 1771
1669 /* Could not do further estimation if the trip count is unknown. Just assume 1772 return false;
1670 prefetching is profitable. Too aggressive??? */
1671 if (est_niter < 0)
1672 return true;
1673
1674 if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1675 {
1676 if (dump_file && (dump_flags & TDF_DETAILS))
1677 fprintf (dump_file,
1678 "Not prefetching -- loop estimated to roll only %d times\n",
1679 (int) est_niter);
1680 return false;
1681 }
1682 return true;
1683 } 1773 }
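Likewise, the ratio computation for this helper sits outside the hunks shown: per the comment above, (unroll_factor * ninsns) estimates the instruction count of the unrolled loop, and that estimate divided by prefetch_count is compared against the minimum instruction-to-prefetch ratio parameter; this is why estimate_prefetch_count was taught about the unroll factor.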
1684 1774
1685 1775
1686 /* Issue prefetch instructions for array references in LOOP. Returns 1776 /* Issue prefetch instructions for array references in LOOP. Returns
1687 true if the LOOP was unrolled. */ 1777 true if the LOOP was unrolled. */
1702 if (dump_file && (dump_flags & TDF_DETAILS)) 1792 if (dump_file && (dump_flags & TDF_DETAILS))
1703 fprintf (dump_file, " ignored (cold area)\n"); 1793 fprintf (dump_file, " ignored (cold area)\n");
1704 return false; 1794 return false;
1705 } 1795 }
1706 1796
1707 /* Step 1: gather the memory references. */
1708 refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1709
1710 /* Step 2: estimate the reuse effects. */
1711 prune_by_reuse (refs);
1712
1713 prefetch_count = estimate_prefetch_count (refs);
1714 if (prefetch_count == 0)
1715 goto fail;
1716
1717 determine_loop_nest_reuse (loop, refs, no_other_refs);
1718
1719 /* Step 3: determine the ahead and unroll factor. */
1720
1721 /* FIXME: the time should be weighted by the probabilities of the blocks in 1797 /* FIXME: the time should be weighted by the probabilities of the blocks in
1722 the loop body. */ 1798 the loop body. */
1723 time = tree_num_loop_insns (loop, &eni_time_weights); 1799 time = tree_num_loop_insns (loop, &eni_time_weights);
1800 if (time == 0)
1801 return false;
1802
1724 ahead = (PREFETCH_LATENCY + time - 1) / time; 1803 ahead = (PREFETCH_LATENCY + time - 1) / time;
1725 est_niter = estimated_loop_iterations_int (loop, false); 1804 est_niter = estimated_loop_iterations_int (loop, false);
1726 1805
1806 /* Prefetching is not likely to be profitable if the trip count to ahead
1807 ratio is too small. */
1808 if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1809 return false;
1810
1727 ninsns = tree_num_loop_insns (loop, &eni_size_weights); 1811 ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1812
1813 /* Step 1: gather the memory references. */
1814 refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1815
1816 /* Give up prefetching if the number of memory references in the
1817 loop is not reasonable based on profitability and compilation time
1818 considerations. */
1819 if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1820 goto fail;
1821
1822 /* Step 2: estimate the reuse effects. */
1823 prune_by_reuse (refs);
1824
1825 if (nothing_to_prefetch_p (refs))
1826 goto fail;
1827
1828 determine_loop_nest_reuse (loop, refs, no_other_refs);
1829
1830 /* Step 3: determine unroll factor. */
1728 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc, 1831 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1729 est_niter); 1832 est_niter);
1833
1834 /* Estimate prefetch count for the unrolled loop. */
1835 prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1836 if (prefetch_count == 0)
1837 goto fail;
1838
1730 if (dump_file && (dump_flags & TDF_DETAILS)) 1839 if (dump_file && (dump_flags & TDF_DETAILS))
1731 fprintf (dump_file, "Ahead %d, unroll factor %d, trip count " 1840 fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1732 HOST_WIDE_INT_PRINT_DEC "\n" 1841 HOST_WIDE_INT_PRINT_DEC "\n"
1733 "insn count %d, mem ref count %d, prefetch count %d\n", 1842 "insn count %d, mem ref count %d, prefetch count %d\n",
1734 ahead, unroll_factor, est_niter, 1843 ahead, unroll_factor, est_niter,
1735 ninsns, mem_ref_count, prefetch_count); 1844 ninsns, mem_ref_count, prefetch_count);
1736 1845
1737 if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns, prefetch_count, 1846 /* Prefetching is not likely to be profitable if the instruction to prefetch
1738 mem_ref_count, unroll_factor)) 1847 ratio is too small. */
1848 if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1849 unroll_factor))
1739 goto fail; 1850 goto fail;
1740 1851
1741 mark_nontemporal_stores (loop, refs); 1852 mark_nontemporal_stores (loop, refs);
1742 1853
1743 /* Step 4: what to prefetch? */ 1854 /* Step 4: what to prefetch? */
1799 1910
1800 initialize_original_copy_tables (); 1911 initialize_original_copy_tables ();
1801 1912
1802 if (!built_in_decls[BUILT_IN_PREFETCH]) 1913 if (!built_in_decls[BUILT_IN_PREFETCH])
1803 { 1914 {
1804 tree type = build_function_type (void_type_node, 1915 tree type = build_function_type_list (void_type_node,
1805 tree_cons (NULL_TREE, 1916 const_ptr_type_node, NULL_TREE);
1806 const_ptr_type_node,
1807 NULL_TREE));
1808 tree decl = add_builtin_function ("__builtin_prefetch", type, 1917 tree decl = add_builtin_function ("__builtin_prefetch", type,
1809 BUILT_IN_PREFETCH, BUILT_IN_NORMAL, 1918 BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1810 NULL, NULL_TREE); 1919 NULL, NULL_TREE);
1811 DECL_IS_NOVOPS (decl) = true; 1920 DECL_IS_NOVOPS (decl) = true;
1812 built_in_decls[BUILT_IN_PREFETCH] = decl; 1921 built_in_decls[BUILT_IN_PREFETCH] = decl;