comparison gcc/tree-ssa-loop-prefetch.c @ 67:f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
author | nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp> |
date | Tue, 22 Mar 2011 17:18:12 +0900 |
parents | b7f97abdc517 |
children | 04ced10e8804 |
comparison
65:65488c3d617d | 67:f6334be47118 |
---|---|
23 #include "tm.h" | 23 #include "tm.h" |
24 #include "tree.h" | 24 #include "tree.h" |
25 #include "tm_p.h" | 25 #include "tm_p.h" |
26 #include "basic-block.h" | 26 #include "basic-block.h" |
27 #include "output.h" | 27 #include "output.h" |
28 #include "diagnostic.h" | |
29 #include "tree-pretty-print.h" | 28 #include "tree-pretty-print.h" |
30 #include "tree-flow.h" | 29 #include "tree-flow.h" |
31 #include "tree-dump.h" | 30 #include "tree-dump.h" |
32 #include "timevar.h" | 31 #include "timevar.h" |
33 #include "cfgloop.h" | 32 #include "cfgloop.h" |
34 #include "expr.h" | |
35 #include "tree-pass.h" | 33 #include "tree-pass.h" |
36 #include "insn-config.h" | 34 #include "insn-config.h" |
37 #include "recog.h" | 35 #include "recog.h" |
38 #include "hashtab.h" | 36 #include "hashtab.h" |
39 #include "tree-chrec.h" | 37 #include "tree-chrec.h" |
40 #include "tree-scalar-evolution.h" | 38 #include "tree-scalar-evolution.h" |
41 #include "toplev.h" | 39 #include "diagnostic-core.h" |
42 #include "params.h" | 40 #include "params.h" |
43 #include "langhooks.h" | 41 #include "langhooks.h" |
44 #include "tree-inline.h" | 42 #include "tree-inline.h" |
45 #include "tree-data-ref.h" | 43 #include "tree-data-ref.h" |
44 | |
45 | |
46 /* FIXME: Needed for optabs, but this should all be moved to a TBD interface | |
47 between the GIMPLE and RTL worlds. */ | |
48 #include "expr.h" | |
46 #include "optabs.h" | 49 #include "optabs.h" |
47 | 50 |
48 /* This pass inserts prefetch instructions to optimize cache usage during | 51 /* This pass inserts prefetch instructions to optimize cache usage during |
49 accesses to arrays in loops. It processes loops sequentially and: | 52 accesses to arrays in loops. It processes loops sequentially and: |
50 | 53 |
74 location 64 iterations before it, and PREFETCH_MOD 64 (since | 77 location 64 iterations before it, and PREFETCH_MOD 64 (since |
75 it hits the same cache line otherwise). | 78 it hits the same cache line otherwise). |
76 (2) has PREFETCH_MOD 64 | 79 (2) has PREFETCH_MOD 64 |
77 (3) has PREFETCH_MOD 4 | 80 (3) has PREFETCH_MOD 4 |
78 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since | 81 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since |
79 the cache line accessed by (4) is the same with probability only | 82 the cache line accessed by (5) is the same with probability only |
80 7/32. | 83 7/32. |
81 (5) has PREFETCH_MOD 1 as well. | 84 (5) has PREFETCH_MOD 1 as well. |
82 | 85 |
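(The worked example that these PREFETCH_MOD values refer to is elided from this hunk. As a reading aid, here is a hedged sketch of the transformation on a simple char-array loop, assuming 64-byte cache lines and an ahead distance of 64 iterations; the function names and constants are illustrative only, not code from this pass.)

/* Illustrative only.  With 64-byte cache lines, a[i] starts a new
   line once every 64 iterations, so its PREFETCH_MOD is 64 and one
   prefetch per cache line suffices.  */
void
before (char *a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = 0;
}

/* Roughly what the pass arranges: unroll so that a single
   __builtin_prefetch per cache line targets the line 64 iterations
   ahead (second argument 1 = write, third 3 = high temporal
   locality).  */
void
after (char *a, int n)
{
  int i;
  for (i = 0; i + 64 <= n; i += 64)
    {
      __builtin_prefetch (&a[i + 64], 1, 3);
      for (int j = 0; j < 64; j++)
        a[i + j] = 0;
    }
  for (; i < n; i++)   /* remainder iterations */
    a[i] = 0;
}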
83 Additionally, we use data dependence analysis to determine for each | 86 Additionally, we use data dependence analysis to determine for each |
84 reference the distance till the first reuse; this information is used | 87 reference the distance till the first reuse; this information is used |
104 | 107 |
105 6) We actually emit the prefetch instructions. ??? Perhaps emit the | 108 6) We actually emit the prefetch instructions. ??? Perhaps emit the |
106 prefetch instructions with guards in cases where 5) was not sufficient | 109 prefetch instructions with guards in cases where 5) was not sufficient |
107 to satisfy the constraints? | 110 to satisfy the constraints? |
108 | 111 |
109 The function is_loop_prefetching_profitable() implements a cost model | 112 A cost model is implemented to determine whether or not prefetching is |
110 to determine if prefetching is profitable for a given loop. The cost | 113 profitable for a given loop. The cost model has three heuristics: |
111 model has two heuristics: | 114 |
112 1. A heuristic that determines whether the given loop has enough CPU | 115 1. Function trip_count_to_ahead_ratio_too_small_p implements a |
113 ops that can be overlapped with cache missing memory ops. | 116 heuristic that determines whether or not the loop has too few |
114 If not, the loop won't benefit from prefetching. This is implemented | 117 iterations (compared to ahead). Prefetching is not likely to be |
115 by requiring the ratio between the instruction count and the mem ref | 118 beneficial if the trip count to ahead ratio is below a certain |
116 count to be above a certain minimum. | 119 minimum. |
117 2. A heuristic that disables prefetching in a loop with an unknown trip | 120 |
118 count if the prefetching cost is above a certain limit. The relative | 121 2. Function mem_ref_count_reasonable_p implements a heuristic that |
119 prefetching cost is estimated by taking the ratio between the | 122 determines whether the given loop has enough CPU ops that can be |
120 prefetch count and the total instruction count (this models the I-cache | 123 overlapped with cache missing memory ops. If not, the loop |
121 cost). | 124 won't benefit from prefetching. In the implementation, |
125 prefetching is not considered beneficial if the ratio between | |
126 the instruction count and the mem ref count is below a certain | |
127 minimum. | |
128 | |
129 3. Function insn_to_prefetch_ratio_too_small_p implements a | |
130 heuristic that disables prefetching in a loop if the prefetching | |
131 cost is above a certain limit. The relative prefetching cost is | |
132 estimated by taking the ratio between the prefetch count and the | |
133 total instruction count (this models the I-cache cost). | |
134 | |
122 The limits used in these heuristics are defined as parameters with | 135 The limits used in these heuristics are defined as parameters with |
123 reasonable default values. Machine-specific default values will be | 136 reasonable default values. Machine-specific default values will be |
124 added later. | 137 added later. |
125 | 138 |
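(The three functions named above all appear later in this diff. As a hypothetical composition, for exposition only, this is how the checks gate the pass; the real code calls them at different points of the restructured loop_prefetch_arrays shown near the end of this diff, not through one helper.)

/* Hypothetical helper restating the gating order of the three
   heuristics; argument values are assumed to have been computed
   already, as in loop_prefetch_arrays.  */
static bool
prefetching_profitable_p (unsigned ahead, HOST_WIDE_INT est_niter,
                          unsigned ninsns, unsigned mem_ref_count,
                          unsigned prefetch_count, unsigned unroll_factor)
{
  /* 1. Loop rolls too few times relative to the prefetch distance.  */
  if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
    return false;

  /* 2. Too little CPU work per memory reference, or too many
     references for the quadratic analyses to stay cheap.  */
  if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
    return false;

  /* 3. Prefetch instructions would crowd the unrolled body and hurt
     the I-cache.  */
  if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
                                          unroll_factor))
    return false;

  return true;
}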
126 Some other TODO: | 139 Some other TODO: |
223 | 236 |
224 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0) | 237 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0) |
225 | 238 |
226 /* Do not generate a prefetch if the unroll factor is significantly less | 239 /* Do not generate a prefetch if the unroll factor is significantly less |
227 than what is required by the prefetch. This is to avoid redundant | 240 than what is required by the prefetch. This is to avoid redundant |
228 prefetches. For example, if prefetch_mod is 16 and unroll_factor is | 241 prefetches. For example, when prefetch_mod is 16 and unroll_factor is |
229 1, this means prefetching requires unrolling the loop 16 times, but | 242 2, prefetching requires unrolling the loop 16 times, but |
230 the loop is not going to be unrolled. In this case (ratio = 16), | 243 the loop is actually unrolled twice. In this case (ratio = 8), |
231 prefetching is not likely to be beneficial. */ | 244 prefetching is not likely to be beneficial. */ |
232 | 245 |
233 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO | 246 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO |
234 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 8 | 247 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4 |
248 #endif | |
249 | |
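(To make the new default of 4 concrete, a small restatement of the ratio test described in the comment above; the helper below is hypothetical, the actual guard sits where prefetches are scheduled.)

/* Hypothetical restatement of the guard.  Using the values from the
   comment above with the new default ratio of 4:
     prefetch_mod = 16, unroll_factor = 2 -> ratio 8 >= 4: skip
     prefetch_mod = 16, unroll_factor = 8 -> ratio 2 <  4: prefetch  */
static int
unroll_too_weak_for_prefetch (unsigned prefetch_mod, unsigned unroll_factor)
{
  return prefetch_mod / unroll_factor
         >= PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO;
}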
250 /* Some of the prefetch computations have quadratic complexity. We want to | |
251 avoid huge compile times and, therefore, want to limit the amount of | |
252 memory references per loop where we consider prefetching. */ | |
253 | |
254 #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP | |
255 #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200 | |
235 #endif | 256 #endif |
236 | 257 |
237 /* The memory reference. */ | 258 /* The memory reference. */ |
238 | 259 |
239 struct mem_ref | 260 struct mem_ref |
399 struct ar_data *ar_data = (struct ar_data *) data; | 420 struct ar_data *ar_data = (struct ar_data *) data; |
400 tree ibase, step, stepsize; | 421 tree ibase, step, stepsize; |
401 HOST_WIDE_INT idelta = 0, imult = 1; | 422 HOST_WIDE_INT idelta = 0, imult = 1; |
402 affine_iv iv; | 423 affine_iv iv; |
403 | 424 |
404 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF | |
405 || TREE_CODE (base) == ALIGN_INDIRECT_REF) | |
406 return false; | |
407 | |
408 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt), | 425 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt), |
409 *index, &iv, true)) | 426 *index, &iv, true)) |
410 return false; | 427 return false; |
411 ibase = iv.base; | 428 ibase = iv.base; |
412 step = iv.step; | 429 step = iv.step; |
419 } | 436 } |
420 if (cst_and_fits_in_hwi (ibase)) | 437 if (cst_and_fits_in_hwi (ibase)) |
421 { | 438 { |
422 idelta += int_cst_value (ibase); | 439 idelta += int_cst_value (ibase); |
423 ibase = build_int_cst (TREE_TYPE (ibase), 0); | 440 ibase = build_int_cst (TREE_TYPE (ibase), 0); |
441 } | |
442 | |
443 if (TREE_CODE (base) == ARRAY_REF) | |
444 { | |
445 stepsize = array_ref_element_size (base); | |
446 if (!cst_and_fits_in_hwi (stepsize)) | |
447 return false; | |
448 imult = int_cst_value (stepsize); | |
449 step = fold_build2 (MULT_EXPR, sizetype, | |
450 fold_convert (sizetype, step), | |
451 fold_convert (sizetype, stepsize)); | |
452 idelta *= imult; | |
424 } | 453 } |
425 | 454 |
426 if (*ar_data->step == NULL_TREE) | 455 if (*ar_data->step == NULL_TREE) |
427 *ar_data->step = step; | 456 *ar_data->step = step; |
428 else | 457 else |
429 *ar_data->step = fold_build2 (PLUS_EXPR, sizetype, | 458 *ar_data->step = fold_build2 (PLUS_EXPR, sizetype, |
430 fold_convert (sizetype, *ar_data->step), | 459 fold_convert (sizetype, *ar_data->step), |
431 fold_convert (sizetype, step)); | 460 fold_convert (sizetype, step)); |
432 if (TREE_CODE (base) == ARRAY_REF) | |
433 { | |
434 stepsize = array_ref_element_size (base); | |
435 if (!cst_and_fits_in_hwi (stepsize)) | |
436 return false; | |
437 imult = int_cst_value (stepsize); | |
438 | |
439 *ar_data->step = fold_build2 (MULT_EXPR, sizetype, | |
440 fold_convert (sizetype, *ar_data->step), | |
441 fold_convert (sizetype, step)); | |
442 idelta *= imult; | |
443 } | |
444 | |
445 *ar_data->delta += idelta; | 461 *ar_data->delta += idelta; |
446 *index = ibase; | 462 *index = ibase; |
447 | 463 |
448 return true; | 464 return true; |
449 } | 465 } |
464 tree ref = *ref_p; | 480 tree ref = *ref_p; |
465 | 481 |
466 *step = NULL_TREE; | 482 *step = NULL_TREE; |
467 *delta = 0; | 483 *delta = 0; |
468 | 484 |
469 /* First strip off the component references. Ignore bitfields. */ | 485 /* First strip off the component references. Ignore bitfields. |
470 if (TREE_CODE (ref) == COMPONENT_REF | 486 Also strip off the real and imaginary parts of a complex, so that |
471 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))) | 487 they can have the same base. */ |
472 ref = TREE_OPERAND (ref, 0); | 488 if (TREE_CODE (ref) == REALPART_EXPR |
489 || TREE_CODE (ref) == IMAGPART_EXPR | |
490 || (TREE_CODE (ref) == COMPONENT_REF | |
491 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))) | |
492 { | |
493 if (TREE_CODE (ref) == IMAGPART_EXPR) | |
494 *delta += int_size_in_bytes (TREE_TYPE (ref)); | |
495 ref = TREE_OPERAND (ref, 0); | |
496 } | |
473 | 497 |
474 *ref_p = ref; | 498 *ref_p = ref; |
475 | 499 |
476 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0)) | 500 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0)) |
477 { | 501 { |
507 | 531 |
508 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt)) | 532 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt)) |
509 return false; | 533 return false; |
510 /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */ | 534 /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */ |
511 if (step == NULL_TREE) | 535 if (step == NULL_TREE) |
536 return false; | |
537 | |
538 /* Stop if the address of BASE could not be taken. */ | |
539 if (may_be_nonaddressable_p (base)) | |
540 return false; | |
541 | |
542 /* Limit non-constant step prefetching only to the innermost loops. */ | |
543 if (!cst_and_fits_in_hwi (step) && loop->inner != NULL) | |
512 return false; | 544 return false; |
513 | 545 |
514 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP | 546 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP |
515 are integer constants. */ | 547 are integer constants. */ |
516 agrp = find_or_create_group (refs, base, step); | 548 agrp = find_or_create_group (refs, base, step); |
632 } | 664 } |
633 | 665 |
634 /* Given a CACHE_LINE_SIZE and two inductive memory references | 666 /* Given a CACHE_LINE_SIZE and two inductive memory references |
635 with a common STEP greater than CACHE_LINE_SIZE and an address | 667 with a common STEP greater than CACHE_LINE_SIZE and an address |
636 difference DELTA, compute the probability that they will fall | 668 difference DELTA, compute the probability that they will fall |
637 in different cache lines. DISTINCT_ITERS is the number of | 669 in different cache lines. Return true if the computed miss rate |
638 distinct iterations after which the pattern repeats itself. | 670 is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the |
671 number of distinct iterations after which the pattern repeats itself. | |
639 ALIGN_UNIT is the unit of alignment in bytes. */ | 672 ALIGN_UNIT is the unit of alignment in bytes. */ |
640 | 673 |
641 static int | 674 static bool |
642 compute_miss_rate (unsigned HOST_WIDE_INT cache_line_size, | 675 is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size, |
643 HOST_WIDE_INT step, HOST_WIDE_INT delta, | 676 HOST_WIDE_INT step, HOST_WIDE_INT delta, |
644 unsigned HOST_WIDE_INT distinct_iters, | 677 unsigned HOST_WIDE_INT distinct_iters, |
645 int align_unit) | 678 int align_unit) |
646 { | 679 { |
647 unsigned align, iter; | 680 unsigned align, iter; |
648 int total_positions, miss_positions, miss_rate; | 681 int total_positions, miss_positions, max_allowed_miss_positions; |
649 int address1, address2, cache_line1, cache_line2; | 682 int address1, address2, cache_line1, cache_line2; |
650 | 683 |
651 total_positions = 0; | 684 /* It always misses if delta is greater than or equal to the cache |
685 line size. */ | |
686 if (delta >= (HOST_WIDE_INT) cache_line_size) | |
687 return false; | |
688 | |
652 miss_positions = 0; | 689 miss_positions = 0; |
690 total_positions = (cache_line_size / align_unit) * distinct_iters; | |
691 max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000; | |
653 | 692 |
654 /* Iterate through all possible alignments of the first | 693 /* Iterate through all possible alignments of the first |
655 memory reference within its cache line. */ | 694 memory reference within its cache line. */ |
656 for (align = 0; align < cache_line_size; align += align_unit) | 695 for (align = 0; align < cache_line_size; align += align_unit) |
657 | 696 |
660 { | 699 { |
661 address1 = align + step * iter; | 700 address1 = align + step * iter; |
662 address2 = address1 + delta; | 701 address2 = address1 + delta; |
663 cache_line1 = address1 / cache_line_size; | 702 cache_line1 = address1 / cache_line_size; |
664 cache_line2 = address2 / cache_line_size; | 703 cache_line2 = address2 / cache_line_size; |
665 total_positions += 1; | |
666 if (cache_line1 != cache_line2) | 704 if (cache_line1 != cache_line2) |
667 miss_positions += 1; | 705 { |
706 miss_positions += 1; | |
707 if (miss_positions > max_allowed_miss_positions) | |
708 return false; | |
709 } | |
668 } | 710 } |
669 miss_rate = 1000 * miss_positions / total_positions; | 711 return true; |
670 return miss_rate; | |
671 } | 712 } |
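(To make the rewritten predicate's arithmetic concrete, a standalone replay of its counting loop for one parameter set; the parameter values, and the 50/1000 threshold standing in for ACCEPTABLE_MISS_RATE, are assumptions chosen for the example.)

#include <stdio.h>

/* Replay of is_miss_rate_acceptable's counting loop for
   cache_line_size = 64, step = 96, delta = 32, distinct_iters = 2,
   align_unit = 4, assumed threshold 50/1000 (5%).  */
int
main (void)
{
  const int cache_line_size = 64, step = 96, delta = 32;
  const int distinct_iters = 2, align_unit = 4, acceptable = 50;
  int total = (cache_line_size / align_unit) * distinct_iters;
  int max_misses = (acceptable * total) / 1000;
  int misses = 0;

  for (int align = 0; align < cache_line_size; align += align_unit)
    for (int iter = 0; iter < distinct_iters; iter++)
      {
        int a1 = align + step * iter;
        int a2 = a1 + delta;
        if (a1 / cache_line_size != a2 / cache_line_size)
          misses++;
      }
  printf ("%d misses out of %d positions (max allowed %d)\n",
          misses, total, max_misses);
  return 0;
}

Here 16 of the 32 positions miss (50%), far above the budget of max_misses = 1, so the real function would bail out early and return false.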
672 | 713 |
673 /* Prune the prefetch candidate REF using the reuse with BY. | 714 /* Prune the prefetch candidate REF using the reuse with BY. |
674 If BY_IS_BEFORE is true, BY is before REF in the loop. */ | 715 If BY_IS_BEFORE is true, BY is before REF in the loop. */ |
675 | 716 |
681 bool backward; | 722 bool backward; |
682 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta; | 723 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta; |
683 HOST_WIDE_INT delta = delta_b - delta_r; | 724 HOST_WIDE_INT delta = delta_b - delta_r; |
684 HOST_WIDE_INT hit_from; | 725 HOST_WIDE_INT hit_from; |
685 unsigned HOST_WIDE_INT prefetch_before, prefetch_block; | 726 unsigned HOST_WIDE_INT prefetch_before, prefetch_block; |
686 int miss_rate; | |
687 HOST_WIDE_INT reduced_step; | 727 HOST_WIDE_INT reduced_step; |
688 unsigned HOST_WIDE_INT reduced_prefetch_block; | 728 unsigned HOST_WIDE_INT reduced_prefetch_block; |
689 tree ref_type; | 729 tree ref_type; |
690 int align_unit; | 730 int align_unit; |
691 | 731 |
780 | 820 |
781 prefetch_before = delta / step; | 821 prefetch_before = delta / step; |
782 delta %= step; | 822 delta %= step; |
783 ref_type = TREE_TYPE (ref->mem); | 823 ref_type = TREE_TYPE (ref->mem); |
784 align_unit = TYPE_ALIGN (ref_type) / 8; | 824 align_unit = TYPE_ALIGN (ref_type) / 8; |
785 miss_rate = compute_miss_rate(prefetch_block, step, delta, | 825 if (is_miss_rate_acceptable (prefetch_block, step, delta, |
786 reduced_prefetch_block, align_unit); | 826 reduced_prefetch_block, align_unit)) |
787 if (miss_rate <= ACCEPTABLE_MISS_RATE) | |
788 { | 827 { |
789 /* Do not reduce prefetch_before if we meet beyond cache size. */ | 828 /* Do not reduce prefetch_before if we meet beyond cache size. */ |
790 if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK) | 829 if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK) |
791 prefetch_before = PREFETCH_ALL; | 830 prefetch_before = PREFETCH_ALL; |
792 if (prefetch_before < ref->prefetch_before) | 831 if (prefetch_before < ref->prefetch_before) |
796 } | 835 } |
797 | 836 |
798 /* Try also the following iteration. */ | 837 /* Try also the following iteration. */ |
799 prefetch_before++; | 838 prefetch_before++; |
800 delta = step - delta; | 839 delta = step - delta; |
801 miss_rate = compute_miss_rate(prefetch_block, step, delta, | 840 if (is_miss_rate_acceptable (prefetch_block, step, delta, |
802 reduced_prefetch_block, align_unit); | 841 reduced_prefetch_block, align_unit)) |
803 if (miss_rate <= ACCEPTABLE_MISS_RATE) | |
804 { | 842 { |
805 if (prefetch_before < ref->prefetch_before) | 843 if (prefetch_before < ref->prefetch_before) |
806 ref->prefetch_before = prefetch_before; | 844 ref->prefetch_before = prefetch_before; |
807 | 845 |
808 return; | 846 return; |
986 } | 1024 } |
987 | 1025 |
988 return any; | 1026 return any; |
989 } | 1027 } |
990 | 1028 |
991 /* Estimate the number of prefetches in the given GROUPS. */ | 1029 /* Return TRUE if no prefetch is going to be generated in the given |
992 | 1030 GROUPS. */ |
993 static int | 1031 |
994 estimate_prefetch_count (struct mem_ref_group *groups) | 1032 static bool |
1033 nothing_to_prefetch_p (struct mem_ref_group *groups) | |
995 { | 1034 { |
996 struct mem_ref *ref; | 1035 struct mem_ref *ref; |
997 int prefetch_count = 0; | |
998 | 1036 |
999 for (; groups; groups = groups->next) | 1037 for (; groups; groups = groups->next) |
1000 for (ref = groups->refs; ref; ref = ref->next) | 1038 for (ref = groups->refs; ref; ref = ref->next) |
1001 if (should_issue_prefetch_p (ref)) | 1039 if (should_issue_prefetch_p (ref)) |
1002 prefetch_count++; | 1040 return false; |
1041 | |
1042 return true; | |
1043 } | |
1044 | |
1045 /* Estimate the number of prefetches in the given GROUPS. | |
1046 UNROLL_FACTOR is the factor by which LOOP was unrolled. */ | |
1047 | |
1048 static int | |
1049 estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor) | |
1050 { | |
1051 struct mem_ref *ref; | |
1052 unsigned n_prefetches; | |
1053 int prefetch_count = 0; | |
1054 | |
1055 for (; groups; groups = groups->next) | |
1056 for (ref = groups->refs; ref; ref = ref->next) | |
1057 if (should_issue_prefetch_p (ref)) | |
1058 { | |
1059 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) | |
1060 / ref->prefetch_mod); | |
1061 prefetch_count += n_prefetches; | |
1062 } | |
1003 | 1063 |
1004 return prefetch_count; | 1064 return prefetch_count; |
1005 } | 1065 } |
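(The per-reference count added here is a ceiling division; a minimal restatement as a hypothetical helper, with illustrative values.)

/* Ceiling division used in estimate_prefetch_count above: one
   prefetch per prefetch_mod iterations of the unrolled body, e.g.
   unroll_factor = 8, prefetch_mod = 3 -> (8 + 3 - 1) / 3 = 3.  */
static unsigned
prefetches_per_body (unsigned unroll_factor, unsigned prefetch_mod)
{
  return (unroll_factor + prefetch_mod - 1) / prefetch_mod;
}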
1006 | 1066 |
1007 /* Issue prefetches for the reference REF into loop as decided before. | 1067 /* Issue prefetches for the reference REF into loop as decided before. |
1029 / ref->prefetch_mod); | 1089 / ref->prefetch_mod); |
1030 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node); | 1090 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node); |
1031 addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base), | 1091 addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base), |
1032 true, NULL, true, GSI_SAME_STMT); | 1092 true, NULL, true, GSI_SAME_STMT); |
1033 write_p = ref->write_p ? integer_one_node : integer_zero_node; | 1093 write_p = ref->write_p ? integer_one_node : integer_zero_node; |
1034 local = build_int_cst (integer_type_node, nontemporal ? 0 : 3); | 1094 local = nontemporal ? integer_zero_node : integer_three_node; |
1035 | 1095 |
1036 for (ap = 0; ap < n_prefetches; ap++) | 1096 for (ap = 0; ap < n_prefetches; ap++) |
1037 { | 1097 { |
1038 if (cst_and_fits_in_hwi (ref->group->step)) | 1098 if (cst_and_fits_in_hwi (ref->group->step)) |
1039 { | 1099 { |
1100 /* Check that we have the storent instruction for the mode. */ | 1160 /* Check that we have the storent instruction for the mode. */ |
1101 mode = TYPE_MODE (TREE_TYPE (ref->mem)); | 1161 mode = TYPE_MODE (TREE_TYPE (ref->mem)); |
1102 if (mode == BLKmode) | 1162 if (mode == BLKmode) |
1103 return false; | 1163 return false; |
1104 | 1164 |
1105 code = optab_handler (storent_optab, mode)->insn_code; | 1165 code = optab_handler (storent_optab, mode); |
1106 return code != CODE_FOR_nothing; | 1166 return code != CODE_FOR_nothing; |
1107 } | 1167 } |
1108 | 1168 |
1109 /* If REF is a nontemporal store, we mark the corresponding modify statement | 1169 /* If REF is a nontemporal store, we mark the corresponding modify statement |
1110 and return true. Otherwise, we return false. */ | 1170 and return true. Otherwise, we return false. */ |
1134 edge exit; | 1194 edge exit; |
1135 gimple call; | 1195 gimple call; |
1136 gimple_stmt_iterator bsi; | 1196 gimple_stmt_iterator bsi; |
1137 unsigned i; | 1197 unsigned i; |
1138 | 1198 |
1139 for (i = 0; VEC_iterate (edge, exits, i, exit); i++) | 1199 FOR_EACH_VEC_ELT (edge, exits, i, exit) |
1140 { | 1200 { |
1141 call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0); | 1201 call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0); |
1142 | 1202 |
1143 if (!single_pred_p (exit->dest) | 1203 if (!single_pred_p (exit->dest) |
1144 /* If possible, we prefer not to insert the fence on other paths | 1204 /* If possible, we prefer not to insert the fence on other paths |
1171 { | 1231 { |
1172 VEC (edge, heap) *exits = get_loop_exit_edges (loop); | 1232 VEC (edge, heap) *exits = get_loop_exit_edges (loop); |
1173 unsigned i; | 1233 unsigned i; |
1174 edge exit; | 1234 edge exit; |
1175 | 1235 |
1176 for (i = 0; VEC_iterate (edge, exits, i, exit); i++) | 1236 FOR_EACH_VEC_ELT (edge, exits, i, exit) |
1177 if ((exit->flags & EDGE_ABNORMAL) | 1237 if ((exit->flags & EDGE_ABNORMAL) |
1178 && exit->dest == EXIT_BLOCK_PTR) | 1238 && exit->dest == EXIT_BLOCK_PTR) |
1179 ret = false; | 1239 ret = false; |
1180 | 1240 |
1181 VEC_free (edge, heap, exits); | 1241 VEC_free (edge, heap, exits); |
1390 the innermost loop in that the stride is less than cache size. */ | 1450 the innermost loop in that the stride is less than cache size. */ |
1391 | 1451 |
1392 strides = XCNEWVEC (HOST_WIDE_INT, n); | 1452 strides = XCNEWVEC (HOST_WIDE_INT, n); |
1393 access_fns = DR_ACCESS_FNS (dr); | 1453 access_fns = DR_ACCESS_FNS (dr); |
1394 | 1454 |
1395 for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++) | 1455 FOR_EACH_VEC_ELT (tree, access_fns, i, access_fn) |
1396 { | 1456 { |
1397 /* Keep track of the reference corresponding to the subscript, so that we | 1457 /* Keep track of the reference corresponding to the subscript, so that we |
1398 know its stride. */ | 1458 know its stride. */ |
1399 while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF) | 1459 while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF) |
1400 ref = TREE_OPERAND (ref, 0); | 1460 ref = TREE_OPERAND (ref, 0); |
1500 are used just as heuristics to estimate temporality of the | 1560 are used just as heuristics to estimate temporality of the |
1501 references, hence we do not need to worry about correctness). */ | 1561 references, hence we do not need to worry about correctness). */ |
1502 for (gr = refs; gr; gr = gr->next) | 1562 for (gr = refs; gr; gr = gr->next) |
1503 for (ref = gr->refs; ref; ref = ref->next) | 1563 for (ref = gr->refs; ref; ref = ref->next) |
1504 { | 1564 { |
1505 dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p); | 1565 dr = create_data_ref (nest, loop_containing_stmt (ref->stmt), |
1566 ref->mem, ref->stmt, !ref->write_p); | |
1506 | 1567 |
1507 if (dr) | 1568 if (dr) |
1508 { | 1569 { |
1509 ref->reuse_distance = volume; | 1570 ref->reuse_distance = volume; |
1510 dr->aux = ref; | 1571 dr->aux = ref; |
1512 } | 1573 } |
1513 else | 1574 else |
1514 no_other_refs = false; | 1575 no_other_refs = false; |
1515 } | 1576 } |
1516 | 1577 |
1517 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++) | 1578 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr) |
1518 { | 1579 { |
1519 dist = self_reuse_distance (dr, loop_data_size, n, loop); | 1580 dist = self_reuse_distance (dr, loop_data_size, n, loop); |
1520 ref = (struct mem_ref *) dr->aux; | 1581 ref = (struct mem_ref *) dr->aux; |
1521 if (ref->reuse_distance > dist) | 1582 if (ref->reuse_distance > dist) |
1522 ref->reuse_distance = dist; | 1583 ref->reuse_distance = dist; |
1525 ref->independent_p = true; | 1586 ref->independent_p = true; |
1526 } | 1587 } |
1527 | 1588 |
1528 compute_all_dependences (datarefs, &dependences, vloops, true); | 1589 compute_all_dependences (datarefs, &dependences, vloops, true); |
1529 | 1590 |
1530 for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++) | 1591 FOR_EACH_VEC_ELT (ddr_p, dependences, i, dep) |
1531 { | 1592 { |
1532 if (DDR_ARE_DEPENDENT (dep) == chrec_known) | 1593 if (DDR_ARE_DEPENDENT (dep) == chrec_known) |
1533 continue; | 1594 continue; |
1534 | 1595 |
1535 ref = (struct mem_ref *) DDR_A (dep)->aux; | 1596 ref = (struct mem_ref *) DDR_A (dep)->aux; |
1604 fprintf (dump_file, " ref %p distance %u\n", | 1665 fprintf (dump_file, " ref %p distance %u\n", |
1605 (void *) ref, ref->reuse_distance); | 1666 (void *) ref, ref->reuse_distance); |
1606 } | 1667 } |
1607 } | 1668 } |
1608 | 1669 |
1609 /* Do a cost-benefit analysis to determine if prefetching is profitable | 1670 /* Determine whether or not the trip count to ahead ratio is too small based |
1610 for the current loop given the following parameters: | 1671 on profitability considerations. |
1611 AHEAD: the iteration ahead distance, | 1672 AHEAD: the iteration ahead distance, |
1612 EST_NITER: the estimated trip count, | 1673 EST_NITER: the estimated trip count. */ |
1674 | |
1675 static bool | |
1676 trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter) | |
1677 { | |
1678 /* Assume trip count to ahead ratio is big enough if the trip count could not | |
1679 be estimated at compile time. */ | |
1680 if (est_niter < 0) | |
1681 return false; | |
1682 | |
1683 if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead)) | |
1684 { | |
1685 if (dump_file && (dump_flags & TDF_DETAILS)) | |
1686 fprintf (dump_file, | |
1687 "Not prefetching -- loop estimated to roll only %d times\n", | |
1688 (int) est_niter); | |
1689 return true; | |
1690 } | |
1691 | |
1692 return false; | |
1693 } | |
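(Numerically, assuming TRIP_COUNT_TO_AHEAD_RATIO keeps its default of 4, which is defined earlier in this file:)

/* Illustrative values for the new predicate:
     trip_count_to_ahead_ratio_too_small_p (10, 39) == true   (39 < 4*10)
     trip_count_to_ahead_ratio_too_small_p (10, 40) == false
     trip_count_to_ahead_ratio_too_small_p (10, -1) == false  (unknown
                                                    count, assumed big)  */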
1694 | |
1695 /* Determine whether or not the number of memory references in the loop is | |
1696 reasonable based on the profitability and compilation time considerations. | |
1613 NINSNS: estimated number of instructions in the loop, | 1697 NINSNS: estimated number of instructions in the loop, |
1614 PREFETCH_COUNT: an estimate of the number of prefetches | |
1615 MEM_REF_COUNT: total number of memory references in the loop. */ | 1698 MEM_REF_COUNT: total number of memory references in the loop. */ |
1616 | 1699 |
1617 static bool | 1700 static bool |
1618 is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter, | 1701 mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count) |
1619 unsigned ninsns, unsigned prefetch_count, | 1702 { |
1620 unsigned mem_ref_count, unsigned unroll_factor) | 1703 int insn_to_mem_ratio; |
1621 { | |
1622 int insn_to_mem_ratio, insn_to_prefetch_ratio; | |
1623 | 1704 |
1624 if (mem_ref_count == 0) | 1705 if (mem_ref_count == 0) |
1706 return false; | |
1707 | |
1708 /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis | |
1709 (compute_all_dependences) have high costs based on quadratic complexity. | |
1710 To avoid huge compilation time, we give up prefetching if mem_ref_count | |
1711 is too large. */ | |
1712 if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP) | |
1625 return false; | 1713 return false; |
1626 | 1714 |
1627 /* Prefetching improves performance by overlapping cache missing | 1715 /* Prefetching improves performance by overlapping cache missing |
1628 memory accesses with CPU operations. If the loop does not have | 1716 memory accesses with CPU operations. If the loop does not have |
1629 enough CPU operations to overlap with memory operations, prefetching | 1717 enough CPU operations to overlap with memory operations, prefetching |
1641 fprintf (dump_file, | 1729 fprintf (dump_file, |
1642 "Not prefetching -- instruction to memory reference ratio (%d) too small\n", | 1730 "Not prefetching -- instruction to memory reference ratio (%d) too small\n", |
1643 insn_to_mem_ratio); | 1731 insn_to_mem_ratio); |
1644 return false; | 1732 return false; |
1645 } | 1733 } |
1734 | |
1735 return true; | |
1736 } | |
1737 | |
1738 /* Determine whether or not the instruction to prefetch ratio in the loop is | |
1739 too small based on the profitability consideration. | |
1740 NINSNS: estimated number of instructions in the loop, | |
1741 PREFETCH_COUNT: an estimate of the number of prefetches, | |
1742 UNROLL_FACTOR: the factor to unroll the loop if prefetching. */ | |
1743 | |
1744 static bool | |
1745 insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count, | |
1746 unsigned unroll_factor) | |
1747 { | |
1748 int insn_to_prefetch_ratio; | |
1646 | 1749 |
1647 /* Prefetching most likely causes performance degradation when the instruction | 1750 /* Prefetching most likely causes performance degradation when the instruction |
1648 to prefetch ratio is too small. Too many prefetch instructions in a loop | 1751 to prefetch ratio is too small. Too many prefetch instructions in a loop |
1649 may reduce the I-cache performance. | 1752 may reduce the I-cache performance. |
1650 (unroll_factor * ninsns) is used to estimate the number of instructions in | 1753 (unroll_factor * ninsns) is used to estimate the number of instructions in |
1661 { | 1764 { |
1662 if (dump_file && (dump_flags & TDF_DETAILS)) | 1765 if (dump_file && (dump_flags & TDF_DETAILS)) |
1663 fprintf (dump_file, | 1766 fprintf (dump_file, |
1664 "Not prefetching -- instruction to prefetch ratio (%d) too small\n", | 1767 "Not prefetching -- instruction to prefetch ratio (%d) too small\n", |
1665 insn_to_prefetch_ratio); | 1768 insn_to_prefetch_ratio); |
1666 return false; | 1769 return true; |
1667 } | 1770 } |
1668 | 1771 |
1669 /* Could not do further estimation if the trip count is unknown. Just assume | 1772 return false; |
1670 prefetching is profitable. Too aggressive??? */ | |
1671 if (est_niter < 0) | |
1672 return true; | |
1673 | |
1674 if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead)) | |
1675 { | |
1676 if (dump_file && (dump_flags & TDF_DETAILS)) | |
1677 fprintf (dump_file, | |
1678 "Not prefetching -- loop estimated to roll only %d times\n", | |
1679 (int) est_niter); | |
1680 return false; | |
1681 } | |
1682 return true; | |
1683 } | 1773 } |
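(A worked example of this heuristic; the comparison itself sits in the body elided from this hunk, and 9 is assumed as the usual default of the min-insn-to-prefetch-ratio parameter.)

/* Worked example:
     ninsns = 20, unroll_factor = 4, prefetch_count = 10
       -> (4 * 20) / 10 = 8  < 9  -> ratio too small, give up
     ninsns = 30, unroll_factor = 4, prefetch_count = 10
       -> (4 * 30) / 10 = 12 >= 9 -> prefetching may proceed  */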
1684 | 1774 |
1685 | 1775 |
1686 /* Issue prefetch instructions for array references in LOOP. Returns | 1776 /* Issue prefetch instructions for array references in LOOP. Returns |
1687 true if the LOOP was unrolled. */ | 1777 true if the LOOP was unrolled. */ |
1702 if (dump_file && (dump_flags & TDF_DETAILS)) | 1792 if (dump_file && (dump_flags & TDF_DETAILS)) |
1703 fprintf (dump_file, " ignored (cold area)\n"); | 1793 fprintf (dump_file, " ignored (cold area)\n"); |
1704 return false; | 1794 return false; |
1705 } | 1795 } |
1706 | 1796 |
1707 /* Step 1: gather the memory references. */ | |
1708 refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count); | |
1709 | |
1710 /* Step 2: estimate the reuse effects. */ | |
1711 prune_by_reuse (refs); | |
1712 | |
1713 prefetch_count = estimate_prefetch_count (refs); | |
1714 if (prefetch_count == 0) | |
1715 goto fail; | |
1716 | |
1717 determine_loop_nest_reuse (loop, refs, no_other_refs); | |
1718 | |
1719 /* Step 3: determine the ahead and unroll factor. */ | |
1720 | |
1721 /* FIXME: the time should be weighted by the probabilities of the blocks in | 1797 /* FIXME: the time should be weighted by the probabilities of the blocks in |
1722 the loop body. */ | 1798 the loop body. */ |
1723 time = tree_num_loop_insns (loop, &eni_time_weights); | 1799 time = tree_num_loop_insns (loop, &eni_time_weights); |
1800 if (time == 0) | |
1801 return false; | |
1802 | |
1724 ahead = (PREFETCH_LATENCY + time - 1) / time; | 1803 ahead = (PREFETCH_LATENCY + time - 1) / time; |
1725 est_niter = estimated_loop_iterations_int (loop, false); | 1804 est_niter = estimated_loop_iterations_int (loop, false); |
1726 | 1805 |
1806 /* Prefetching is not likely to be profitable if the trip count to ahead | |
1807 ratio is too small. */ | |
1808 if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter)) | |
1809 return false; | |
1810 | |
1727 ninsns = tree_num_loop_insns (loop, &eni_size_weights); | 1811 ninsns = tree_num_loop_insns (loop, &eni_size_weights); |
1812 | |
1813 /* Step 1: gather the memory references. */ | |
1814 refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count); | |
1815 | |
1816 /* Give up prefetching if the number of memory references in the | |
1817 loop is not reasonable based on profitability and compilation time | |
1818 considerations. */ | |
1819 if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count)) | |
1820 goto fail; | |
1821 | |
1822 /* Step 2: estimate the reuse effects. */ | |
1823 prune_by_reuse (refs); | |
1824 | |
1825 if (nothing_to_prefetch_p (refs)) | |
1826 goto fail; | |
1827 | |
1828 determine_loop_nest_reuse (loop, refs, no_other_refs); | |
1829 | |
1830 /* Step 3: determine unroll factor. */ | |
1728 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc, | 1831 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc, |
1729 est_niter); | 1832 est_niter); |
1833 | |
1834 /* Estimate prefetch count for the unrolled loop. */ | |
1835 prefetch_count = estimate_prefetch_count (refs, unroll_factor); | |
1836 if (prefetch_count == 0) | |
1837 goto fail; | |
1838 | |
1730 if (dump_file && (dump_flags & TDF_DETAILS)) | 1839 if (dump_file && (dump_flags & TDF_DETAILS)) |
1731 fprintf (dump_file, "Ahead %d, unroll factor %d, trip count " | 1840 fprintf (dump_file, "Ahead %d, unroll factor %d, trip count " |
1732 HOST_WIDE_INT_PRINT_DEC "\n" | 1841 HOST_WIDE_INT_PRINT_DEC "\n" |
1733 "insn count %d, mem ref count %d, prefetch count %d\n", | 1842 "insn count %d, mem ref count %d, prefetch count %d\n", |
1734 ahead, unroll_factor, est_niter, | 1843 ahead, unroll_factor, est_niter, |
1735 ninsns, mem_ref_count, prefetch_count); | 1844 ninsns, mem_ref_count, prefetch_count); |
1736 | 1845 |
1737 if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns, prefetch_count, | 1846 /* Prefetching is not likely to be profitable if the instruction to prefetch |
1738 mem_ref_count, unroll_factor)) | 1847 ratio is too small. */ |
1848 if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count, | |
1849 unroll_factor)) | |
1739 goto fail; | 1850 goto fail; |
1740 | 1851 |
1741 mark_nontemporal_stores (loop, refs); | 1852 mark_nontemporal_stores (loop, refs); |
1742 | 1853 |
1743 /* Step 4: what to prefetch? */ | 1854 /* Step 4: what to prefetch? */ |
1799 | 1910 |
1800 initialize_original_copy_tables (); | 1911 initialize_original_copy_tables (); |
1801 | 1912 |
1802 if (!built_in_decls[BUILT_IN_PREFETCH]) | 1913 if (!built_in_decls[BUILT_IN_PREFETCH]) |
1803 { | 1914 { |
1804 tree type = build_function_type (void_type_node, | 1915 tree type = build_function_type_list (void_type_node, |
1805 tree_cons (NULL_TREE, | 1916 const_ptr_type_node, NULL_TREE); |
1806 const_ptr_type_node, | |
1807 NULL_TREE)); | |
1808 tree decl = add_builtin_function ("__builtin_prefetch", type, | 1917 tree decl = add_builtin_function ("__builtin_prefetch", type, |
1809 BUILT_IN_PREFETCH, BUILT_IN_NORMAL, | 1918 BUILT_IN_PREFETCH, BUILT_IN_NORMAL, |
1810 NULL, NULL_TREE); | 1919 NULL, NULL_TREE); |
1811 DECL_IS_NOVOPS (decl) = true; | 1920 DECL_IS_NOVOPS (decl) = true; |
1812 built_in_decls[BUILT_IN_PREFETCH] = decl; | 1921 built_in_decls[BUILT_IN_PREFETCH] = decl; |