comparison gcc/tree-ssa-loop-prefetch.c @ 63:b7f97abdc517 gcc-4.6-20100522

update gcc from gcc-4.5.0 to gcc-4.6
author ryoma <e075725@ie.u-ryukyu.ac.jp>
date Mon, 24 May 2010 12:47:05 +0900
parents 77e2b8dfacca
children f6334be47118
comparison
56:3c8a44c06a95 63:b7f97abdc517
1 /* Array prefetching. 1 /* Array prefetching.
2 Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc. 2 Copyright (C) 2005, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
3 3
4 This file is part of GCC. 4 This file is part of GCC.
5 5
6 GCC is free software; you can redistribute it and/or modify it 6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the 7 under the terms of the GNU General Public License as published by the
20 #include "config.h" 20 #include "config.h"
21 #include "system.h" 21 #include "system.h"
22 #include "coretypes.h" 22 #include "coretypes.h"
23 #include "tm.h" 23 #include "tm.h"
24 #include "tree.h" 24 #include "tree.h"
25 #include "rtl.h"
26 #include "tm_p.h" 25 #include "tm_p.h"
27 #include "hard-reg-set.h"
28 #include "basic-block.h" 26 #include "basic-block.h"
29 #include "output.h" 27 #include "output.h"
30 #include "diagnostic.h" 28 #include "diagnostic.h"
29 #include "tree-pretty-print.h"
31 #include "tree-flow.h" 30 #include "tree-flow.h"
32 #include "tree-dump.h" 31 #include "tree-dump.h"
33 #include "timevar.h" 32 #include "timevar.h"
34 #include "cfgloop.h" 33 #include "cfgloop.h"
35 #include "varray.h"
36 #include "expr.h" 34 #include "expr.h"
37 #include "tree-pass.h" 35 #include "tree-pass.h"
38 #include "ggc.h"
39 #include "insn-config.h" 36 #include "insn-config.h"
40 #include "recog.h" 37 #include "recog.h"
41 #include "hashtab.h" 38 #include "hashtab.h"
42 #include "tree-chrec.h" 39 #include "tree-chrec.h"
43 #include "tree-scalar-evolution.h" 40 #include "tree-scalar-evolution.h"
198 195
199 #ifndef FENCE_FOLLOWING_MOVNT 196 #ifndef FENCE_FOLLOWING_MOVNT
200 #define FENCE_FOLLOWING_MOVNT NULL_TREE 197 #define FENCE_FOLLOWING_MOVNT NULL_TREE
201 #endif 198 #endif
202 199
200 /* It is not profitable to prefetch when the trip count is not at
201 least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
202 For example, in a loop with a prefetch ahead distance of 10,
203 supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
204 profitable to prefetch when the trip count is greater than or equal to
205 40. In that case, 30 out of the 40 iterations will benefit from
206 prefetching. */
207
208 #ifndef TRIP_COUNT_TO_AHEAD_RATIO
209 #define TRIP_COUNT_TO_AHEAD_RATIO 4
210 #endif
211
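The constant above feeds the trip-count guard added to is_loop_prefetching_profitable later in this diff.  A minimal sketch of that guard, assuming est_niter holds the estimated iteration count and ahead the prefetch-ahead distance, as they are used elsewhere in this file:

    /* Sketch of the trip-count guard: with ahead = 10 and a ratio of 4,
       loops estimated to roll fewer than 40 times are not prefetched;
       an unknown trip count (est_niter < 0) is optimistically accepted.  */
    if (est_niter >= 0
        && est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
      return false;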
203 /* The group of references between which reuse may occur. */ 212 /* The group of references between which reuse may occur. */
204 213
205 struct mem_ref_group 214 struct mem_ref_group
206 { 215 {
207 tree base; /* Base of the reference. */ 216 tree base; /* Base of the reference. */
208 HOST_WIDE_INT step; /* Step of the reference. */ 217 tree step; /* Step of the reference. */
209 struct mem_ref *refs; /* References in the group. */ 218 struct mem_ref *refs; /* References in the group. */
210 struct mem_ref_group *next; /* Next group of references. */ 219 struct mem_ref_group *next; /* Next group of references. */
211 }; 220 };
212 221
213 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */ 222 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
214 223
215 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0) 224 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
225
226 /* Do not generate a prefetch if the unroll factor is significantly less
227 than what is required by the prefetch. This is to avoid redundant
228 prefetches. For example, if prefetch_mod is 16 and unroll_factor is
229 1, this means prefetching requires unrolling the loop 16 times, but
230 the loop is not going to be unrolled. In this case (ratio = 16),
231 prefetching is not likely to be beneficial. */
232
233 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
234 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 8
235 #endif
216 236
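The ratio above is consumed by the prefetch scheduling loop later in this diff.  A minimal sketch of that filter, assuming ref and unroll_factor as they are used there:

    /* With prefetch_mod = 16 and unroll_factor = 1 the ratio is 16 > 8, so
       the reference is skipped rather than prefetched redundantly.  */
    if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
      continue;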
217 /* The memory reference. */ 237 /* The memory reference. */
218 238
219 struct mem_ref 239 struct mem_ref
220 { 240 {
247 fprintf (file, "Reference %p:\n", (void *) ref); 267 fprintf (file, "Reference %p:\n", (void *) ref);
248 268
249 fprintf (file, " group %p (base ", (void *) ref->group); 269 fprintf (file, " group %p (base ", (void *) ref->group);
250 print_generic_expr (file, ref->group->base, TDF_SLIM); 270 print_generic_expr (file, ref->group->base, TDF_SLIM);
251 fprintf (file, ", step "); 271 fprintf (file, ", step ");
252 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step); 272 if (cst_and_fits_in_hwi (ref->group->step))
273 fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (ref->group->step));
274 else
275 print_generic_expr (file, ref->group->step, TDF_TREE);
253 fprintf (file, ")\n"); 276 fprintf (file, ")\n");
254 277
255 fprintf (file, " delta "); 278 fprintf (file, " delta ");
256 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta); 279 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
257 fprintf (file, "\n"); 280 fprintf (file, "\n");
263 286
264 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not 287 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
265 exist. */ 288 exist. */
266 289
267 static struct mem_ref_group * 290 static struct mem_ref_group *
268 find_or_create_group (struct mem_ref_group **groups, tree base, 291 find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
269 HOST_WIDE_INT step)
270 { 292 {
271 struct mem_ref_group *group; 293 struct mem_ref_group *group;
272 294
273 for (; *groups; groups = &(*groups)->next) 295 for (; *groups; groups = &(*groups)->next)
274 { 296 {
275 if ((*groups)->step == step 297 if (operand_equal_p ((*groups)->step, step, 0)
276 && operand_equal_p ((*groups)->base, base, 0)) 298 && operand_equal_p ((*groups)->base, base, 0))
277 return *groups; 299 return *groups;
278 300
279 /* Keep the list of groups sorted by decreasing step. */ 301 /* If step is an integer constant, keep the list of groups sorted
280 if ((*groups)->step < step) 302 by decreasing step. */
303 if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
304 && int_cst_value ((*groups)->step) < int_cst_value (step))
281 break; 305 break;
282 } 306 }
283 307
284 group = XNEW (struct mem_ref_group); 308 group = XNEW (struct mem_ref_group);
285 group->base = base; 309 group->base = base;
360 384
361 struct ar_data 385 struct ar_data
362 { 386 {
363 struct loop *loop; /* Loop of the reference. */ 387 struct loop *loop; /* Loop of the reference. */
364 gimple stmt; /* Statement of the reference. */ 388 gimple stmt; /* Statement of the reference. */
365 HOST_WIDE_INT *step; /* Step of the memory reference. */ 389 tree *step; /* Step of the memory reference. */
366 HOST_WIDE_INT *delta; /* Offset of the memory reference. */ 390 HOST_WIDE_INT *delta; /* Offset of the memory reference. */
367 }; 391 };
368 392
369 /* Analyzes a single INDEX of a memory reference to obtain information 393 /* Analyzes a single INDEX of a memory reference to obtain information
370 described at analyze_ref. Callback for for_each_index. */ 394 described at analyze_ref. Callback for for_each_index. */
372 static bool 396 static bool
373 idx_analyze_ref (tree base, tree *index, void *data) 397 idx_analyze_ref (tree base, tree *index, void *data)
374 { 398 {
375 struct ar_data *ar_data = (struct ar_data *) data; 399 struct ar_data *ar_data = (struct ar_data *) data;
376 tree ibase, step, stepsize; 400 tree ibase, step, stepsize;
377 HOST_WIDE_INT istep, idelta = 0, imult = 1; 401 HOST_WIDE_INT idelta = 0, imult = 1;
378 affine_iv iv; 402 affine_iv iv;
379 403
380 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF 404 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
381 || TREE_CODE (base) == ALIGN_INDIRECT_REF) 405 || TREE_CODE (base) == ALIGN_INDIRECT_REF)
382 return false; 406 return false;
383 407
384 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt), 408 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
385 *index, &iv, false)) 409 *index, &iv, true))
386 return false; 410 return false;
387 ibase = iv.base; 411 ibase = iv.base;
388 step = iv.step; 412 step = iv.step;
389 413
390 if (!cst_and_fits_in_hwi (step))
391 return false;
392 istep = int_cst_value (step);
393
394 if (TREE_CODE (ibase) == POINTER_PLUS_EXPR 414 if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
395 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1))) 415 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
396 { 416 {
397 idelta = int_cst_value (TREE_OPERAND (ibase, 1)); 417 idelta = int_cst_value (TREE_OPERAND (ibase, 1));
398 ibase = TREE_OPERAND (ibase, 0); 418 ibase = TREE_OPERAND (ibase, 0);
401 { 421 {
402 idelta += int_cst_value (ibase); 422 idelta += int_cst_value (ibase);
403 ibase = build_int_cst (TREE_TYPE (ibase), 0); 423 ibase = build_int_cst (TREE_TYPE (ibase), 0);
404 } 424 }
405 425
426 if (*ar_data->step == NULL_TREE)
427 *ar_data->step = step;
428 else
429 *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
430 fold_convert (sizetype, *ar_data->step),
431 fold_convert (sizetype, step));
406 if (TREE_CODE (base) == ARRAY_REF) 432 if (TREE_CODE (base) == ARRAY_REF)
407 { 433 {
408 stepsize = array_ref_element_size (base); 434 stepsize = array_ref_element_size (base);
409 if (!cst_and_fits_in_hwi (stepsize)) 435 if (!cst_and_fits_in_hwi (stepsize))
410 return false; 436 return false;
411 imult = int_cst_value (stepsize); 437 imult = int_cst_value (stepsize);
412 438
413 istep *= imult; 439 *ar_data->step = fold_build2 (MULT_EXPR, sizetype,
440 fold_convert (sizetype, *ar_data->step),
441 fold_convert (sizetype, step));
414 idelta *= imult; 442 idelta *= imult;
415 } 443 }
416 444
417 *ar_data->step += istep;
418 *ar_data->delta += idelta; 445 *ar_data->delta += idelta;
419 *index = ibase; 446 *index = ibase;
420 447
421 return true; 448 return true;
422 } 449 }
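Keeping the accumulated step as a tree instead of a HOST_WIDE_INT is what lets the pass describe strides that are loop-invariant but not compile-time constants.  A hypothetical C loop of that shape (function and parameter names are illustrative only):

    /* The stride is loop-invariant but unknown at compile time; the old
       HOST_WIDE_INT-based analysis gave up on such references, while a
       tree-valued step can still represent the access pattern.  */
    void
    scale_rows (double *a, int n, int stride)
    {
      int i;
      for (i = 0; i < n; i++)
        a[i * stride] *= 2.0;   /* step = stride * sizeof (double)  */
    }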
426 reference occurs in statement STMT. Strips nonaddressable component 453 reference occurs in statement STMT. Strips nonaddressable component
427 references from REF_P. */ 454 references from REF_P. */
428 455
429 static bool 456 static bool
430 analyze_ref (struct loop *loop, tree *ref_p, tree *base, 457 analyze_ref (struct loop *loop, tree *ref_p, tree *base,
431 HOST_WIDE_INT *step, HOST_WIDE_INT *delta, 458 tree *step, HOST_WIDE_INT *delta,
432 gimple stmt) 459 gimple stmt)
433 { 460 {
434 struct ar_data ar_data; 461 struct ar_data ar_data;
435 tree off; 462 tree off;
436 HOST_WIDE_INT bit_offset; 463 HOST_WIDE_INT bit_offset;
437 tree ref = *ref_p; 464 tree ref = *ref_p;
438 465
439 *step = 0; 466 *step = NULL_TREE;
440 *delta = 0; 467 *delta = 0;
441 468
442 /* First strip off the component references. Ignore bitfields. */ 469 /* First strip off the component references. Ignore bitfields. */
443 if (TREE_CODE (ref) == COMPONENT_REF 470 if (TREE_CODE (ref) == COMPONENT_REF
444 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))) 471 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
469 496
470 static bool 497 static bool
471 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs, 498 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
472 tree ref, bool write_p, gimple stmt) 499 tree ref, bool write_p, gimple stmt)
473 { 500 {
474 tree base; 501 tree base, step;
475 HOST_WIDE_INT step, delta; 502 HOST_WIDE_INT delta;
476 struct mem_ref_group *agrp; 503 struct mem_ref_group *agrp;
477 504
478 if (get_base_address (ref) == NULL) 505 if (get_base_address (ref) == NULL)
479 return false; 506 return false;
480 507
481 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt)) 508 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
509 return false;
510 /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
511 if (step == NULL_TREE)
482 return false; 512 return false;
483 513
484 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP 514 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
485 are integer constants. */ 515 are integer constants. */
486 agrp = find_or_create_group (refs, base, step); 516 agrp = find_or_create_group (refs, base, step);
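As a worked illustration of the decomposition in the comment above (hypothetical numbers): for an access a[i + 3] over an int array whose index i starts at 0 and steps by 1, analyze_ref yields BASE = &a, STEP = 4 and DELTA = 12, i.e. REF = &a + 4 * iter + 12.  With the tree-valued STEP introduced by this patch, STEP may also be a loop-invariant expression such as stride * 4 rather than an integer constant.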
552 /* Prune the prefetch candidate REF using the self-reuse. */ 582 /* Prune the prefetch candidate REF using the self-reuse. */
553 583
554 static void 584 static void
555 prune_ref_by_self_reuse (struct mem_ref *ref) 585 prune_ref_by_self_reuse (struct mem_ref *ref)
556 { 586 {
557 HOST_WIDE_INT step = ref->group->step; 587 HOST_WIDE_INT step;
558 bool backward = step < 0; 588 bool backward;
589
590 /* If the step size is non-constant, we cannot calculate prefetch_mod. */
591 if (!cst_and_fits_in_hwi (ref->group->step))
592 return;
593
594 step = int_cst_value (ref->group->step);
595
596 backward = step < 0;
559 597
560 if (step == 0) 598 if (step == 0)
561 { 599 {
562 /* Prefetch references to invariant address just once. */ 600 /* Prefetch references to invariant address just once. */
563 ref->prefetch_before = 1; 601 ref->prefetch_before = 1;
637 675
638 static void 676 static void
639 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by, 677 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
640 bool by_is_before) 678 bool by_is_before)
641 { 679 {
642 HOST_WIDE_INT step = ref->group->step; 680 HOST_WIDE_INT step;
643 bool backward = step < 0; 681 bool backward;
644 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta; 682 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
645 HOST_WIDE_INT delta = delta_b - delta_r; 683 HOST_WIDE_INT delta = delta_b - delta_r;
646 HOST_WIDE_INT hit_from; 684 HOST_WIDE_INT hit_from;
647 unsigned HOST_WIDE_INT prefetch_before, prefetch_block; 685 unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
648 int miss_rate; 686 int miss_rate;
649 HOST_WIDE_INT reduced_step; 687 HOST_WIDE_INT reduced_step;
650 unsigned HOST_WIDE_INT reduced_prefetch_block; 688 unsigned HOST_WIDE_INT reduced_prefetch_block;
651 tree ref_type; 689 tree ref_type;
652 int align_unit; 690 int align_unit;
653 691
692 /* If the step is non-constant, we cannot calculate prefetch_before. */
693 if (!cst_and_fits_in_hwi (ref->group->step)) {
694 return;
695 }
696
697 step = int_cst_value (ref->group->step);
698
699 backward = step < 0;
700
701
654 if (delta == 0) 702 if (delta == 0)
655 { 703 {
656 /* If the references have the same address, only prefetch the 704 /* If the references have the same address, only prefetch the
657 former. */ 705 former. */
658 if (by_is_before) 706 if (by_is_before)
703 { 751 {
704 /* The accesses are sure to meet. Let us check when. */ 752 /* The accesses are sure to meet. Let us check when. */
705 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK; 753 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
706 prefetch_before = (hit_from - delta_r + step - 1) / step; 754 prefetch_before = (hit_from - delta_r + step - 1) / step;
707 755
756 /* Do not reduce prefetch_before if we meet beyond cache size. */
757 if (prefetch_before > (unsigned) abs (L2_CACHE_SIZE_BYTES / step))
758 prefetch_before = PREFETCH_ALL;
708 if (prefetch_before < ref->prefetch_before) 759 if (prefetch_before < ref->prefetch_before)
709 ref->prefetch_before = prefetch_before; 760 ref->prefetch_before = prefetch_before;
710 761
711 return; 762 return;
712 } 763 }
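A worked example of the computation above, with hypothetical values PREFETCH_BLOCK = 64, delta_r = 0, delta_b = 200 and step = 8: hit_from = ddown (200, 64) * 64 = 192, so prefetch_before = (192 + 8 - 1) / 8 = 24, meaning only the first 24 iterations need their own prefetch before the reference starts hitting cache lines already brought in for BY.  The new guard keeps prefetch_before at PREFETCH_ALL when that meeting point lies more than L2_CACHE_SIZE_BYTES / step iterations away, since the line would likely be evicted from the cache before it could be reused.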
733 align_unit = TYPE_ALIGN (ref_type) / 8; 784 align_unit = TYPE_ALIGN (ref_type) / 8;
734 miss_rate = compute_miss_rate(prefetch_block, step, delta, 785 miss_rate = compute_miss_rate(prefetch_block, step, delta,
735 reduced_prefetch_block, align_unit); 786 reduced_prefetch_block, align_unit);
736 if (miss_rate <= ACCEPTABLE_MISS_RATE) 787 if (miss_rate <= ACCEPTABLE_MISS_RATE)
737 { 788 {
789 /* Do not reduce prefetch_before if we meet beyond cache size. */
790 if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
791 prefetch_before = PREFETCH_ALL;
738 if (prefetch_before < ref->prefetch_before) 792 if (prefetch_before < ref->prefetch_before)
739 ref->prefetch_before = prefetch_before; 793 ref->prefetch_before = prefetch_before;
740 794
741 return; 795 return;
742 } 796 }
847 should_issue_prefetch_p (struct mem_ref *ref) 901 should_issue_prefetch_p (struct mem_ref *ref)
848 { 902 {
849 /* For now do not issue prefetches for only the first few of the 903 /* For now do not issue prefetches for only the first few of the
850 iterations. */ 904 iterations. */
851 if (ref->prefetch_before != PREFETCH_ALL) 905 if (ref->prefetch_before != PREFETCH_ALL)
852 return false; 906 {
907 if (dump_file && (dump_flags & TDF_DETAILS))
908 fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
909 (void *) ref);
910 return false;
911 }
853 912
854 /* Do not prefetch nontemporal stores. */ 913 /* Do not prefetch nontemporal stores. */
855 if (ref->storent_p) 914 if (ref->storent_p)
856 return false; 915 {
916 if (dump_file && (dump_flags & TDF_DETAILS))
917 fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
918 return false;
919 }
857 920
858 return true; 921 return true;
859 } 922 }
860 923
861 /* Decide which of the prefetch candidates in GROUPS to prefetch. 924 /* Decide which of the prefetch candidates in GROUPS to prefetch.
893 for (ref = groups->refs; ref; ref = ref->next) 956 for (ref = groups->refs; ref; ref = ref->next)
894 { 957 {
895 if (!should_issue_prefetch_p (ref)) 958 if (!should_issue_prefetch_p (ref))
896 continue; 959 continue;
897 960
961 /* The loop is far from being sufficiently unrolled for this
962 prefetch. Do not generate prefetch to avoid many redudant
963 prefetches. */
964 if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
965 continue;
966
898 /* If we need to prefetch the reference each PREFETCH_MOD iterations, 967 /* If we need to prefetch the reference each PREFETCH_MOD iterations,
899 and we unroll the loop UNROLL_FACTOR times, we need to insert 968 and we unroll the loop UNROLL_FACTOR times, we need to insert
900 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each 969 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
901 iteration. */ 970 iteration. */
902 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) 971 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
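The expression is the usual integer ceiling: with hypothetical values unroll_factor = 4 and prefetch_mod = 3 it gives n_prefetches = (4 + 3 - 1) / 3 = 2, i.e. two prefetch instructions per unrolled loop body.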
941 1010
942 static void 1011 static void
943 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead) 1012 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
944 { 1013 {
945 HOST_WIDE_INT delta; 1014 HOST_WIDE_INT delta;
946 tree addr, addr_base, write_p, local; 1015 tree addr, addr_base, write_p, local, forward;
947 gimple prefetch; 1016 gimple prefetch;
948 gimple_stmt_iterator bsi; 1017 gimple_stmt_iterator bsi;
949 unsigned n_prefetches, ap; 1018 unsigned n_prefetches, ap;
950 bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES; 1019 bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
951 1020
964 write_p = ref->write_p ? integer_one_node : integer_zero_node; 1033 write_p = ref->write_p ? integer_one_node : integer_zero_node;
965 local = build_int_cst (integer_type_node, nontemporal ? 0 : 3); 1034 local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);
966 1035
967 for (ap = 0; ap < n_prefetches; ap++) 1036 for (ap = 0; ap < n_prefetches; ap++)
968 { 1037 {
969 /* Determine the address to prefetch. */ 1038 if (cst_and_fits_in_hwi (ref->group->step))
970 delta = (ahead + ap * ref->prefetch_mod) * ref->group->step; 1039 {
971 addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node, 1040 /* Determine the address to prefetch. */
972 addr_base, size_int (delta)); 1041 delta = (ahead + ap * ref->prefetch_mod) *
973 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL, 1042 int_cst_value (ref->group->step);
974 true, GSI_SAME_STMT); 1043 addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node,
975 1044 addr_base, size_int (delta));
1045 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
1046 true, GSI_SAME_STMT);
1047 }
1048 else
1049 {
1050 /* The step size is non-constant but loop-invariant. We use the
1051 heuristic of simply prefetching AHEAD iterations ahead. */
1052 forward = fold_build2 (MULT_EXPR, sizetype,
1053 fold_convert (sizetype, ref->group->step),
1054 fold_convert (sizetype, size_int (ahead)));
1055 addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node, addr_base,
1056 forward);
1057 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1058 NULL, true, GSI_SAME_STMT);
1059 }
976 /* Create the prefetch instruction. */ 1060 /* Create the prefetch instruction. */
977 prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH], 1061 prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH],
978 3, addr, write_p, local); 1062 3, addr, write_p, local);
979 gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT); 1063 gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
980 } 1064 }
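At the source level, each statement built by gimple_build_call above corresponds roughly to the call sketched below (illustrative only; ADDR stands for addr_base offset either by the constant delta or, in the non-constant-step case, by the step * ahead expression):

    __builtin_prefetch (addr, /* write_p: 1 for stores, 0 for loads */ 1,
                        /* locality: 0 if nontemporal, 3 otherwise */ 3);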
1531 MEM_REF_COUNT: total number of memory references in the loop. */ 1615 MEM_REF_COUNT: total number of memory references in the loop. */
1532 1616
1533 static bool 1617 static bool
1534 is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter, 1618 is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter,
1535 unsigned ninsns, unsigned prefetch_count, 1619 unsigned ninsns, unsigned prefetch_count,
1536 unsigned mem_ref_count) 1620 unsigned mem_ref_count, unsigned unroll_factor)
1537 { 1621 {
1538 int insn_to_mem_ratio, insn_to_prefetch_ratio; 1622 int insn_to_mem_ratio, insn_to_prefetch_ratio;
1539 1623
1540 if (mem_ref_count == 0) 1624 if (mem_ref_count == 0)
1541 return false; 1625 return false;
1550 for each CPU or memory op in the loop. Time estimates for memory ops 1634 for each CPU or memory op in the loop. Time estimates for memory ops
1551 should account for cache misses. */ 1635 should account for cache misses. */
1552 insn_to_mem_ratio = ninsns / mem_ref_count; 1636 insn_to_mem_ratio = ninsns / mem_ref_count;
1553 1637
1554 if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO) 1638 if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1555 return false; 1639 {
1556 1640 if (dump_file && (dump_flags & TDF_DETAILS))
1557 /* Profitability of prefetching is highly dependent on the trip count. 1641 fprintf (dump_file,
1558 For a given AHEAD distance, the first AHEAD iterations do not benefit 1642 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1559 from prefetching, and the last AHEAD iterations execute useless 1643 insn_to_mem_ratio);
1560 prefetches. So, if the trip count is not large enough relative to AHEAD, 1644 return false;
1561 prefetching may cause serious performance degradation. To avoid this 1645 }
1562 problem when the trip count is not known at compile time, we 1646
1563 conservatively skip loops with high prefetching costs. For now, only 1647 /* Prefetching most likely causes performance degradation when the instruction
1564 the I-cache cost is considered. The relative I-cache cost is estimated 1648 to prefetch ratio is too small. Too many prefetch instructions in a loop
1565 by taking the ratio between the number of prefetches and the total 1649 may reduce the I-cache performance.
1566 number of instructions. Since we are using integer arithmetic, we 1650 (unroll_factor * ninsns) is used to estimate the number of instructions in
1567 compute the reciprocal of this ratio. 1651 the unrolled loop. This implementation is a bit simplistic -- the number
1568 TODO: Account for loop unrolling, which may reduce the costs of 1652 of issued prefetch instructions is also affected by unrolling. So,
1569 shorter stride prefetches. Note that not accounting for loop 1653 prefetch_mod and the unroll factor should be taken into account when
1570 unrolling over-estimates the cost and hence gives more conservative 1654 determining prefetch_count. Also, the number of insns of the unrolled
1571 results. */ 1655 loop will usually be significantly smaller than the number of insns of the
1656 original loop * unroll_factor (at least the induction variable increases
1657 and the exit branches will get eliminated), so it might be better to use
1658 tree_estimate_loop_size + estimated_unrolled_size. */
1659 insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1660 if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1661 {
1662 if (dump_file && (dump_flags & TDF_DETAILS))
1663 fprintf (dump_file,
1664 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1665 insn_to_prefetch_ratio);
1666 return false;
1667 }
1668
1669 /* We cannot do further estimation if the trip count is unknown. Just assume
1670 prefetching is profitable. Too aggressive??? */
1572 if (est_niter < 0) 1671 if (est_niter < 0)
1573 { 1672 return true;
1574 insn_to_prefetch_ratio = ninsns / prefetch_count; 1673
1575 return insn_to_prefetch_ratio >= MIN_INSN_TO_PREFETCH_RATIO; 1674 if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1576 }
1577
1578 if (est_niter <= (HOST_WIDE_INT) ahead)
1579 { 1675 {
1580 if (dump_file && (dump_flags & TDF_DETAILS)) 1676 if (dump_file && (dump_flags & TDF_DETAILS))
1581 fprintf (dump_file, 1677 fprintf (dump_file,
1582 "Not prefetching -- loop estimated to roll only %d times\n", 1678 "Not prefetching -- loop estimated to roll only %d times\n",
1583 (int) est_niter); 1679 (int) est_niter);
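Putting the new checks together with hypothetical numbers: a loop with ninsns = 40 and mem_ref_count = 4 has insn_to_mem_ratio = 10; with unroll_factor = 4 and prefetch_count = 8 the estimated insn_to_prefetch_ratio is (4 * 40) / 8 = 20; and with ahead = 10 the loop must be estimated to roll at least TRIP_COUNT_TO_AHEAD_RATIO * 10 = 40 times.  Each quantity is compared against its threshold (PREFETCH_MIN_INSN_TO_MEM_RATIO and MIN_INSN_TO_PREFETCH_RATIO are defined near the top of this file, outside the hunks shown here), and failing any one of the checks makes the pass skip prefetching for the loop.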
1636 HOST_WIDE_INT_PRINT_DEC "\n" 1732 HOST_WIDE_INT_PRINT_DEC "\n"
1637 "insn count %d, mem ref count %d, prefetch count %d\n", 1733 "insn count %d, mem ref count %d, prefetch count %d\n",
1638 ahead, unroll_factor, est_niter, 1734 ahead, unroll_factor, est_niter,
1639 ninsns, mem_ref_count, prefetch_count); 1735 ninsns, mem_ref_count, prefetch_count);
1640 1736
1641 if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns, 1737 if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns, prefetch_count,
1642 prefetch_count, mem_ref_count)) 1738 mem_ref_count, unroll_factor))
1643 goto fail; 1739 goto fail;
1644 1740
1645 mark_nontemporal_stores (loop, refs); 1741 mark_nontemporal_stores (loop, refs);
1646 1742
1647 /* Step 4: what to prefetch? */ 1743 /* Step 4: what to prefetch? */