comparison gcc/tree-ssa-loop-prefetch.c @ 63:b7f97abdc517 gcc-4.6-20100522
update gcc from gcc-4.5.0 to gcc-4.6
author:   ryoma <e075725@ie.u-ryukyu.ac.jp>
date:     Mon, 24 May 2010 12:47:05 +0900
parents:  77e2b8dfacca
children: f6334be47118
56:3c8a44c06a95 (old) | 63:b7f97abdc517 (new)
1 /* Array prefetching. | 1 /* Array prefetching. |
2 Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc. | 2 Copyright (C) 2005, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. |
3 | 3 |
4 This file is part of GCC. | 4 This file is part of GCC. |
5 | 5 |
6 GCC is free software; you can redistribute it and/or modify it | 6 GCC is free software; you can redistribute it and/or modify it |
7 under the terms of the GNU General Public License as published by the | 7 under the terms of the GNU General Public License as published by the |
20 #include "config.h" | 20 #include "config.h" |
21 #include "system.h" | 21 #include "system.h" |
22 #include "coretypes.h" | 22 #include "coretypes.h" |
23 #include "tm.h" | 23 #include "tm.h" |
24 #include "tree.h" | 24 #include "tree.h" |
25 #include "rtl.h" | |
26 #include "tm_p.h" | 25 #include "tm_p.h" |
27 #include "hard-reg-set.h" | |
28 #include "basic-block.h" | 26 #include "basic-block.h" |
29 #include "output.h" | 27 #include "output.h" |
30 #include "diagnostic.h" | 28 #include "diagnostic.h" |
29 #include "tree-pretty-print.h" | |
31 #include "tree-flow.h" | 30 #include "tree-flow.h" |
32 #include "tree-dump.h" | 31 #include "tree-dump.h" |
33 #include "timevar.h" | 32 #include "timevar.h" |
34 #include "cfgloop.h" | 33 #include "cfgloop.h" |
35 #include "varray.h" | |
36 #include "expr.h" | 34 #include "expr.h" |
37 #include "tree-pass.h" | 35 #include "tree-pass.h" |
38 #include "ggc.h" | |
39 #include "insn-config.h" | 36 #include "insn-config.h" |
40 #include "recog.h" | 37 #include "recog.h" |
41 #include "hashtab.h" | 38 #include "hashtab.h" |
42 #include "tree-chrec.h" | 39 #include "tree-chrec.h" |
43 #include "tree-scalar-evolution.h" | 40 #include "tree-scalar-evolution.h" |
198 | 195 |
199 #ifndef FENCE_FOLLOWING_MOVNT | 196 #ifndef FENCE_FOLLOWING_MOVNT |
200 #define FENCE_FOLLOWING_MOVNT NULL_TREE | 197 #define FENCE_FOLLOWING_MOVNT NULL_TREE |
201 #endif | 198 #endif |
202 | 199 |
200 /* It is not profitable to prefetch when the trip count is not at | |
201 least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance. | |
202 For example, in a loop with a prefetch ahead distance of 10, | |
203 supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is | |
204 profitable to prefetch when the trip count is greater than or equal to | |
205 40. In that case, 30 out of the 40 iterations will benefit from | |
206 prefetching. */ | |
207 | |
208 #ifndef TRIP_COUNT_TO_AHEAD_RATIO | |
209 #define TRIP_COUNT_TO_AHEAD_RATIO 4 | |
210 #endif | |
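A minimal standalone sketch of the trip-count test described in the comment above (plain C, not the GCC source; the function name and sample values are illustrative):

    #include <stdio.h>

    #define TRIP_COUNT_TO_AHEAD_RATIO 4

    /* Profitable only when the estimated trip count reaches
       TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.  */
    static int
    trip_count_profitable (long est_niter, unsigned ahead)
    {
      return est_niter >= (long) (TRIP_COUNT_TO_AHEAD_RATIO * ahead);
    }

    int
    main (void)
    {
      /* ahead = 10: require at least 4 * 10 = 40 iterations; with 40,
         roughly 30 of them actually benefit from prefetching.  */
      printf ("%d\n", trip_count_profitable (40, 10)); /* 1 */
      printf ("%d\n", trip_count_profitable (39, 10)); /* 0 */
      return 0;
    }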
211 | |
203 /* The group of references between which reuse may occur. */ | 212 /* The group of references between which reuse may occur. */ |
204 | 213 |
205 struct mem_ref_group | 214 struct mem_ref_group |
206 { | 215 { |
207 tree base; /* Base of the reference. */ | 216 tree base; /* Base of the reference. */ |
208 HOST_WIDE_INT step; /* Step of the reference. */ | 217 tree step; /* Step of the reference. */ |
209 struct mem_ref *refs; /* References in the group. */ | 218 struct mem_ref *refs; /* References in the group. */ |
210 struct mem_ref_group *next; /* Next group of references. */ | 219 struct mem_ref_group *next; /* Next group of references. */ |
211 }; | 220 }; |
212 | 221 |
213 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */ | 222 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */ |
214 | 223 |
215 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0) | 224 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0) |
225 | |
226 /* Do not generate a prefetch if the unroll factor is significantly less | |
227 than what is required by the prefetch. This is to avoid redundant | |
228 prefetches. For example, if prefetch_mod is 16 and unroll_factor is | |
229 1, this means prefetching requires unrolling the loop 16 times, but | |
230 the loop is not going to be unrolled. In this case (ratio = 16), | |
231 prefetching is not likely to be beneficial. */ | |
232 | |
233 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO | |
234 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 8 | |
235 #endif | |
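A similar sketch of the unroll-factor filter above (illustrative only; it mirrors the `prefetch_mod / unroll_factor` test applied later in this diff when deciding which references to prefetch):

    /* Skip a reference whose prefetch_mod is much larger than the
       unroll factor (plain C sketch, not the GCC implementation).  */
    #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 8

    static int
    prefetch_not_redundant (unsigned prefetch_mod, unsigned unroll_factor)
    {
      /* prefetch_mod = 16, unroll_factor = 1: ratio 16 > 8, so the
         prefetch would fire 16x more often than needed -- skip it.  */
      return prefetch_mod / unroll_factor
             <= PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO;
    }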
216 | 236 |
217 /* The memory reference. */ | 237 /* The memory reference. */ |
218 | 238 |
219 struct mem_ref | 239 struct mem_ref |
220 { | 240 { |
247 fprintf (file, "Reference %p:\n", (void *) ref); | 267 fprintf (file, "Reference %p:\n", (void *) ref); |
248 | 268 |
249 fprintf (file, " group %p (base ", (void *) ref->group); | 269 fprintf (file, " group %p (base ", (void *) ref->group); |
250 print_generic_expr (file, ref->group->base, TDF_SLIM); | 270 print_generic_expr (file, ref->group->base, TDF_SLIM); |
251 fprintf (file, ", step "); | 271 fprintf (file, ", step "); |
252 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step); | 272 if (cst_and_fits_in_hwi (ref->group->step)) |
273 fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (ref->group->step)); | |
274 else | |
275 print_generic_expr (file, ref->group->step, TDF_TREE); | |
253 fprintf (file, ")\n"); | 276 fprintf (file, ")\n"); |
254 | 277 |
255 fprintf (file, " delta "); | 278 fprintf (file, " delta "); |
256 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta); | 279 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta); |
257 fprintf (file, "\n"); | 280 fprintf (file, "\n"); |
263 | 286 |
264 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not | 287 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not |
265 exist. */ | 288 exist. */ |
266 | 289 |
267 static struct mem_ref_group * | 290 static struct mem_ref_group * |
268 find_or_create_group (struct mem_ref_group **groups, tree base, | 291 find_or_create_group (struct mem_ref_group **groups, tree base, tree step) |
269 HOST_WIDE_INT step) | |
270 { | 292 { |
271 struct mem_ref_group *group; | 293 struct mem_ref_group *group; |
272 | 294 |
273 for (; *groups; groups = &(*groups)->next) | 295 for (; *groups; groups = &(*groups)->next) |
274 { | 296 { |
275 if ((*groups)->step == step | 297 if (operand_equal_p ((*groups)->step, step, 0) |
276 && operand_equal_p ((*groups)->base, base, 0)) | 298 && operand_equal_p ((*groups)->base, base, 0)) |
277 return *groups; | 299 return *groups; |
278 | 300 |
279 /* Keep the list of groups sorted by decreasing step. */ | 301 /* If step is an integer constant, keep the list of groups sorted |
280 if ((*groups)->step < step) | 302 by decreasing step. */ |
303 if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step) | |
304 && int_cst_value ((*groups)->step) < int_cst_value (step)) | |
281 break; | 305 break; |
282 } | 306 } |
283 | 307 |
284 group = XNEW (struct mem_ref_group); | 308 group = XNEW (struct mem_ref_group); |
285 group->base = base; | 309 group->base = base; |
360 | 384 |
361 struct ar_data | 385 struct ar_data |
362 { | 386 { |
363 struct loop *loop; /* Loop of the reference. */ | 387 struct loop *loop; /* Loop of the reference. */ |
364 gimple stmt; /* Statement of the reference. */ | 388 gimple stmt; /* Statement of the reference. */ |
365 HOST_WIDE_INT *step; /* Step of the memory reference. */ | 389 tree *step; /* Step of the memory reference. */ |
366 HOST_WIDE_INT *delta; /* Offset of the memory reference. */ | 390 HOST_WIDE_INT *delta; /* Offset of the memory reference. */ |
367 }; | 391 }; |
368 | 392 |
369 /* Analyzes a single INDEX of a memory reference to obtain information | 393 /* Analyzes a single INDEX of a memory reference to obtain information |
370 described at analyze_ref. Callback for for_each_index. */ | 394 described at analyze_ref. Callback for for_each_index. */ |
372 static bool | 396 static bool |
373 idx_analyze_ref (tree base, tree *index, void *data) | 397 idx_analyze_ref (tree base, tree *index, void *data) |
374 { | 398 { |
375 struct ar_data *ar_data = (struct ar_data *) data; | 399 struct ar_data *ar_data = (struct ar_data *) data; |
376 tree ibase, step, stepsize; | 400 tree ibase, step, stepsize; |
377 HOST_WIDE_INT istep, idelta = 0, imult = 1; | 401 HOST_WIDE_INT idelta = 0, imult = 1; |
378 affine_iv iv; | 402 affine_iv iv; |
379 | 403 |
380 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF | 404 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF |
381 || TREE_CODE (base) == ALIGN_INDIRECT_REF) | 405 || TREE_CODE (base) == ALIGN_INDIRECT_REF) |
382 return false; | 406 return false; |
383 | 407 |
384 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt), | 408 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt), |
385 *index, &iv, false)) | 409 *index, &iv, true)) |
386 return false; | 410 return false; |
387 ibase = iv.base; | 411 ibase = iv.base; |
388 step = iv.step; | 412 step = iv.step; |
389 | 413 |
390 if (!cst_and_fits_in_hwi (step)) | |
391 return false; | |
392 istep = int_cst_value (step); | |
393 | |
394 if (TREE_CODE (ibase) == POINTER_PLUS_EXPR | 414 if (TREE_CODE (ibase) == POINTER_PLUS_EXPR |
395 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1))) | 415 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1))) |
396 { | 416 { |
397 idelta = int_cst_value (TREE_OPERAND (ibase, 1)); | 417 idelta = int_cst_value (TREE_OPERAND (ibase, 1)); |
398 ibase = TREE_OPERAND (ibase, 0); | 418 ibase = TREE_OPERAND (ibase, 0); |
401 { | 421 { |
402 idelta += int_cst_value (ibase); | 422 idelta += int_cst_value (ibase); |
403 ibase = build_int_cst (TREE_TYPE (ibase), 0); | 423 ibase = build_int_cst (TREE_TYPE (ibase), 0); |
404 } | 424 } |
405 | 425 |
426 if (*ar_data->step == NULL_TREE) | |
427 *ar_data->step = step; | |
428 else | |
429 *ar_data->step = fold_build2 (PLUS_EXPR, sizetype, | |
430 fold_convert (sizetype, *ar_data->step), | |
431 fold_convert (sizetype, step)); | |
406 if (TREE_CODE (base) == ARRAY_REF) | 432 if (TREE_CODE (base) == ARRAY_REF) |
407 { | 433 { |
408 stepsize = array_ref_element_size (base); | 434 stepsize = array_ref_element_size (base); |
409 if (!cst_and_fits_in_hwi (stepsize)) | 435 if (!cst_and_fits_in_hwi (stepsize)) |
410 return false; | 436 return false; |
411 imult = int_cst_value (stepsize); | 437 imult = int_cst_value (stepsize); |
412 | 438 |
413 istep *= imult; | 439 *ar_data->step = fold_build2 (MULT_EXPR, sizetype, |
440 fold_convert (sizetype, *ar_data->step), | |
441 fold_convert (sizetype, step)); | |
414 idelta *= imult; | 442 idelta *= imult; |
415 } | 443 } |
416 | 444 |
417 *ar_data->step += istep; | |
418 *ar_data->delta += idelta; | 445 *ar_data->delta += idelta; |
419 *index = ibase; | 446 *index = ibase; |
420 | 447 |
421 return true; | 448 return true; |
422 } | 449 } |
426 reference occurs in statement STMT. Strips nonaddressable component | 453 reference occurs in statement STMT. Strips nonaddressable component |
427 references from REF_P. */ | 454 references from REF_P. */ |
428 | 455 |
429 static bool | 456 static bool |
430 analyze_ref (struct loop *loop, tree *ref_p, tree *base, | 457 analyze_ref (struct loop *loop, tree *ref_p, tree *base, |
431 HOST_WIDE_INT *step, HOST_WIDE_INT *delta, | 458 tree *step, HOST_WIDE_INT *delta, |
432 gimple stmt) | 459 gimple stmt) |
433 { | 460 { |
434 struct ar_data ar_data; | 461 struct ar_data ar_data; |
435 tree off; | 462 tree off; |
436 HOST_WIDE_INT bit_offset; | 463 HOST_WIDE_INT bit_offset; |
437 tree ref = *ref_p; | 464 tree ref = *ref_p; |
438 | 465 |
439 *step = 0; | 466 *step = NULL_TREE; |
440 *delta = 0; | 467 *delta = 0; |
441 | 468 |
442 /* First strip off the component references. Ignore bitfields. */ | 469 /* First strip off the component references. Ignore bitfields. */ |
443 if (TREE_CODE (ref) == COMPONENT_REF | 470 if (TREE_CODE (ref) == COMPONENT_REF |
444 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))) | 471 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))) |
469 | 496 |
470 static bool | 497 static bool |
471 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs, | 498 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs, |
472 tree ref, bool write_p, gimple stmt) | 499 tree ref, bool write_p, gimple stmt) |
473 { | 500 { |
474 tree base; | 501 tree base, step; |
475 HOST_WIDE_INT step, delta; | 502 HOST_WIDE_INT delta; |
476 struct mem_ref_group *agrp; | 503 struct mem_ref_group *agrp; |
477 | 504 |
478 if (get_base_address (ref) == NULL) | 505 if (get_base_address (ref) == NULL) |
479 return false; | 506 return false; |
480 | 507 |
481 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt)) | 508 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt)) |
509 return false; | |
510 /* If analyze_ref fails, the default is NULL_TREE. We can stop here. */ | |
511 if (step == NULL_TREE) | |
482 return false; | 512 return false; |
483 | 513 |
484 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP | 514 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP |
485 are integer constants. */ | 515 are integer constants. */ |
486 agrp = find_or_create_group (refs, base, step); | 516 agrp = find_or_create_group (refs, base, step); |
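As a concrete instance of the `REF = &BASE + STEP * iter + DELTA` decomposition (an illustrative example, not taken from the source; 4-byte `int` elements assumed):

    int a[100], sum;

    void
    example (int n)
    {
      int i;
      for (i = 0; i < n; i++)
        sum += a[i + 3];
      /* For the load of a[i + 3]:
           BASE  = &a    (shared by all a[...] references)
           STEP  = 4     (iv step 1 * 4-byte element size)
           DELTA = 12    (constant offset 3 * element size)
         a[i] and a[i + 3] would share one mem_ref_group (same BASE
         and STEP), differing only in the per-reference DELTA.  */
    }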
552 /* Prune the prefetch candidate REF using the self-reuse. */ | 582 /* Prune the prefetch candidate REF using the self-reuse. */ |
553 | 583 |
554 static void | 584 static void |
555 prune_ref_by_self_reuse (struct mem_ref *ref) | 585 prune_ref_by_self_reuse (struct mem_ref *ref) |
556 { | 586 { |
557 HOST_WIDE_INT step = ref->group->step; | 587 HOST_WIDE_INT step; |
558 bool backward = step < 0; | 588 bool backward; |
589 | |
590 /* If the step size is non-constant, we cannot calculate prefetch_mod. */ | |
591 if (!cst_and_fits_in_hwi (ref->group->step)) | |
592 return; | |
593 | |
594 step = int_cst_value (ref->group->step); | |
595 | |
596 backward = step < 0; | |
559 | 597 |
560 if (step == 0) | 598 if (step == 0) |
561 { | 599 { |
562 /* Prefetch references to invariant address just once. */ | 600 /* Prefetch references to invariant address just once. */ |
563 ref->prefetch_before = 1; | 601 ref->prefetch_before = 1; |
637 | 675 |
638 static void | 676 static void |
639 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by, | 677 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by, |
640 bool by_is_before) | 678 bool by_is_before) |
641 { | 679 { |
642 HOST_WIDE_INT step = ref->group->step; | 680 HOST_WIDE_INT step; |
643 bool backward = step < 0; | 681 bool backward; |
644 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta; | 682 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta; |
645 HOST_WIDE_INT delta = delta_b - delta_r; | 683 HOST_WIDE_INT delta = delta_b - delta_r; |
646 HOST_WIDE_INT hit_from; | 684 HOST_WIDE_INT hit_from; |
647 unsigned HOST_WIDE_INT prefetch_before, prefetch_block; | 685 unsigned HOST_WIDE_INT prefetch_before, prefetch_block; |
648 int miss_rate; | 686 int miss_rate; |
649 HOST_WIDE_INT reduced_step; | 687 HOST_WIDE_INT reduced_step; |
650 unsigned HOST_WIDE_INT reduced_prefetch_block; | 688 unsigned HOST_WIDE_INT reduced_prefetch_block; |
651 tree ref_type; | 689 tree ref_type; |
652 int align_unit; | 690 int align_unit; |
653 | 691 |
692 /* If the step is non-constant, we cannot calculate prefetch_before. */ | |
693 if (!cst_and_fits_in_hwi (ref->group->step)) | |
694 return; | |
695 | |
696 | |
697 step = int_cst_value (ref->group->step); | |
698 | |
699 backward = step < 0; | |
700 | |
701 | |
654 if (delta == 0) | 702 if (delta == 0) |
655 { | 703 { |
656 /* If the references have the same address, only prefetch the | 704 /* If the references have the same address, only prefetch the |
657 former. */ | 705 former. */ |
658 if (by_is_before) | 706 if (by_is_before) |
703 { | 751 { |
704 /* The accesses are sure to meet. Let us check when. */ | 752 /* The accesses are sure to meet. Let us check when. */ |
705 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK; | 753 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK; |
706 prefetch_before = (hit_from - delta_r + step - 1) / step; | 754 prefetch_before = (hit_from - delta_r + step - 1) / step; |
707 | 755 |
756 /* Do not reduce prefetch_before if we meet beyond cache size. */ | |
757 if (prefetch_before > (unsigned) abs (L2_CACHE_SIZE_BYTES / step)) | |
758 prefetch_before = PREFETCH_ALL; | |
708 if (prefetch_before < ref->prefetch_before) | 759 if (prefetch_before < ref->prefetch_before) |
709 ref->prefetch_before = prefetch_before; | 760 ref->prefetch_before = prefetch_before; |
710 | 761 |
711 return; | 762 return; |
712 } | 763 } |
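A worked instance of the hit_from / prefetch_before arithmetic above (standalone C; the 64-byte PREFETCH_BLOCK and all deltas are assumed sample values):

    #include <stdio.h>

    #define PREFETCH_BLOCK 64   /* assumed cache-block size */

    int
    main (void)
    {
      long delta_r = 8, delta_b = 100, step = 16;
      /* ddown: round delta_b down to a multiple of PREFETCH_BLOCK
         (values here are non-negative, so plain division suffices).  */
      long hit_from = (delta_b / PREFETCH_BLOCK) * PREFETCH_BLOCK; /* 64 */
      /* Ceiling division: after ceil((64 - 8) / 16) = 4 iterations the
         trailing reference reaches the block the leading one touched.  */
      long prefetch_before = (hit_from - delta_r + step - 1) / step;
      printf ("%ld\n", prefetch_before); /* 4 */
      return 0;
    }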
733 align_unit = TYPE_ALIGN (ref_type) / 8; | 784 align_unit = TYPE_ALIGN (ref_type) / 8; |
734 miss_rate = compute_miss_rate(prefetch_block, step, delta, | 785 miss_rate = compute_miss_rate(prefetch_block, step, delta, |
735 reduced_prefetch_block, align_unit); | 786 reduced_prefetch_block, align_unit); |
736 if (miss_rate <= ACCEPTABLE_MISS_RATE) | 787 if (miss_rate <= ACCEPTABLE_MISS_RATE) |
737 { | 788 { |
789 /* Do not reduce prefetch_before if we meet beyond cache size. */ | |
790 if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK) | |
791 prefetch_before = PREFETCH_ALL; | |
738 if (prefetch_before < ref->prefetch_before) | 792 if (prefetch_before < ref->prefetch_before) |
739 ref->prefetch_before = prefetch_before; | 793 ref->prefetch_before = prefetch_before; |
740 | 794 |
741 return; | 795 return; |
742 } | 796 } |
847 should_issue_prefetch_p (struct mem_ref *ref) | 901 should_issue_prefetch_p (struct mem_ref *ref) |
848 { | 902 { |
849 /* For now, do not issue prefetches that cover only the first few | 903 /* For now, do not issue prefetches that cover only the first few |
850 iterations. */ | 904 iterations. */ |
851 if (ref->prefetch_before != PREFETCH_ALL) | 905 if (ref->prefetch_before != PREFETCH_ALL) |
852 return false; | 906 { |
907 if (dump_file && (dump_flags & TDF_DETAILS)) | |
908 fprintf (dump_file, "Ignoring %p due to prefetch_before\n", | |
909 (void *) ref); | |
910 return false; | |
911 } | |
853 | 912 |
854 /* Do not prefetch nontemporal stores. */ | 913 /* Do not prefetch nontemporal stores. */ |
855 if (ref->storent_p) | 914 if (ref->storent_p) |
856 return false; | 915 { |
916 if (dump_file && (dump_flags & TDF_DETAILS)) | |
917 fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref); | |
918 return false; | |
919 } | |
857 | 920 |
858 return true; | 921 return true; |
859 } | 922 } |
860 | 923 |
861 /* Decide which of the prefetch candidates in GROUPS to prefetch. | 924 /* Decide which of the prefetch candidates in GROUPS to prefetch. |
893 for (ref = groups->refs; ref; ref = ref->next) | 956 for (ref = groups->refs; ref; ref = ref->next) |
894 { | 957 { |
895 if (!should_issue_prefetch_p (ref)) | 958 if (!should_issue_prefetch_p (ref)) |
896 continue; | 959 continue; |
897 | 960 |
961 /* The loop is far from being sufficiently unrolled for this | |
962 prefetch. Do not generate the prefetch to avoid many redundant | |
963 prefetches. */ | |
964 if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO) | |
965 continue; | |
966 | |
898 /* If we need to prefetch the reference each PREFETCH_MOD iterations, | 967 /* If we need to prefetch the reference each PREFETCH_MOD iterations, |
899 and we unroll the loop UNROLL_FACTOR times, we need to insert | 968 and we unroll the loop UNROLL_FACTOR times, we need to insert |
900 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each | 969 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each |
901 iteration. */ | 970 iteration. */ |
902 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) | 971 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) |
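For example (sample values, not from the source): with unroll_factor = 4 and prefetch_mod = 3, the ceiling division yields 2 prefetch instructions per unrolled body:

    /* Sample evaluation of the ceiling division above (not GCC code).  */
    static unsigned
    n_prefetches_example (void)
    {
      unsigned unroll_factor = 4, prefetch_mod = 3;
      return (unroll_factor + prefetch_mod - 1) / prefetch_mod; /* 2 */
    }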
941 | 1010 |
942 static void | 1011 static void |
943 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead) | 1012 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead) |
944 { | 1013 { |
945 HOST_WIDE_INT delta; | 1014 HOST_WIDE_INT delta; |
946 tree addr, addr_base, write_p, local; | 1015 tree addr, addr_base, write_p, local, forward; |
947 gimple prefetch; | 1016 gimple prefetch; |
948 gimple_stmt_iterator bsi; | 1017 gimple_stmt_iterator bsi; |
949 unsigned n_prefetches, ap; | 1018 unsigned n_prefetches, ap; |
950 bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES; | 1019 bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES; |
951 | 1020 |
964 write_p = ref->write_p ? integer_one_node : integer_zero_node; | 1033 write_p = ref->write_p ? integer_one_node : integer_zero_node; |
965 local = build_int_cst (integer_type_node, nontemporal ? 0 : 3); | 1034 local = build_int_cst (integer_type_node, nontemporal ? 0 : 3); |
966 | 1035 |
967 for (ap = 0; ap < n_prefetches; ap++) | 1036 for (ap = 0; ap < n_prefetches; ap++) |
968 { | 1037 { |
969 /* Determine the address to prefetch. */ | 1038 if (cst_and_fits_in_hwi (ref->group->step)) |
970 delta = (ahead + ap * ref->prefetch_mod) * ref->group->step; | 1039 { |
971 addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node, | 1040 /* Determine the address to prefetch. */ |
972 addr_base, size_int (delta)); | 1041 delta = (ahead + ap * ref->prefetch_mod) * |
973 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL, | 1042 int_cst_value (ref->group->step); |
974 true, GSI_SAME_STMT); | 1043 addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node, |
975 | 1044 addr_base, size_int (delta)); |
1045 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL, | |
1046 true, GSI_SAME_STMT); | |
1047 } | |
1048 else | |
1049 { | |
1050 /* The step size is non-constant but loop-invariant. We use the | |
1051 heuristic of simply prefetching AHEAD iterations ahead. */ | |
1052 forward = fold_build2 (MULT_EXPR, sizetype, | |
1053 fold_convert (sizetype, ref->group->step), | |
1054 fold_convert (sizetype, size_int (ahead))); | |
1055 addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node, addr_base, | |
1056 forward); | |
1057 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, | |
1058 NULL, true, GSI_SAME_STMT); | |
1059 } | |
976 /* Create the prefetch instruction. */ | 1060 /* Create the prefetch instruction. */ |
977 prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH], | 1061 prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH], |
978 3, addr, write_p, local); | 1062 3, addr, write_p, local); |
979 gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT); | 1063 gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT); |
980 } | 1064 } |
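Roughly what the emitted prefetch corresponds to at source level, for the constant-step case (all values illustrative: step 16, ahead 4, one prefetch per iteration; `__builtin_prefetch` takes an address, a read/write flag, and a locality hint 0-3):

    void
    example (char *p, int n)
    {
      int i;
      for (i = 0; i < n; i++)
        {
          /* delta = (ahead + ap * prefetch_mod) * step = (4 + 0) * 16.  */
          __builtin_prefetch (p + i * 16 + 64, /* write_p */ 1, /* local */ 3);
          p[i * 16] = 0;
        }
    }

In the non-constant (loop-invariant) step case added by this hunk, the offset is instead `step * ahead`, computed at run time.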
1531 MEM_REF_COUNT: total number of memory references in the loop. */ | 1615 MEM_REF_COUNT: total number of memory references in the loop. */ |
1532 | 1616 |
1533 static bool | 1617 static bool |
1534 is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter, | 1618 is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter, |
1535 unsigned ninsns, unsigned prefetch_count, | 1619 unsigned ninsns, unsigned prefetch_count, |
1536 unsigned mem_ref_count) | 1620 unsigned mem_ref_count, unsigned unroll_factor) |
1537 { | 1621 { |
1538 int insn_to_mem_ratio, insn_to_prefetch_ratio; | 1622 int insn_to_mem_ratio, insn_to_prefetch_ratio; |
1539 | 1623 |
1540 if (mem_ref_count == 0) | 1624 if (mem_ref_count == 0) |
1541 return false; | 1625 return false; |
1550 for each CPU or memory op in the loop. Time estimates for memory ops | 1634 for each CPU or memory op in the loop. Time estimates for memory ops |
1551 should account for cache misses. */ | 1635 should account for cache misses. */ |
1552 insn_to_mem_ratio = ninsns / mem_ref_count; | 1636 insn_to_mem_ratio = ninsns / mem_ref_count; |
1553 | 1637 |
1554 if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO) | 1638 if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO) |
1555 return false; | 1639 { |
1556 | 1640 if (dump_file && (dump_flags & TDF_DETAILS)) |
1557 /* Profitability of prefetching is highly dependent on the trip count. | 1641 fprintf (dump_file, |
1558 For a given AHEAD distance, the first AHEAD iterations do not benefit | 1642 "Not prefetching -- instruction to memory reference ratio (%d) too small\n", |
1559 from prefetching, and the last AHEAD iterations execute useless | 1643 insn_to_mem_ratio); |
1560 prefetches. So, if the trip count is not large enough relative to AHEAD, | 1644 return false; |
1561 prefetching may cause serious performance degradation. To avoid this | 1645 } |
1562 problem when the trip count is not known at compile time, we | 1646 |
1563 conservatively skip loops with high prefetching costs. For now, only | 1647 /* Prefetching most likely causes performance degradation when the instruction |
1564 the I-cache cost is considered. The relative I-cache cost is estimated | 1648 to prefetch ratio is too small. Too many prefetch instructions in a loop |
1565 by taking the ratio between the number of prefetches and the total | 1649 may reduce the I-cache performance. |
1566 number of instructions. Since we are using integer arithmetic, we | 1650 (unroll_factor * ninsns) is used to estimate the number of instructions in |
1567 compute the reciprocal of this ratio. | 1651 the unrolled loop. This implementation is a bit simplistic -- the number |
1568 TODO: Account for loop unrolling, which may reduce the costs of | 1652 of issued prefetch instructions is also affected by unrolling. So, |
1569 shorter stride prefetches. Note that not accounting for loop | 1653 prefetch_mod and the unroll factor should be taken into account when |
1570 unrolling over-estimates the cost and hence gives more conservative | 1654 determining prefetch_count. Also, the number of insns of the unrolled |
1571 results. */ | 1655 loop will usually be significantly smaller than the number of insns of the |
1656 original loop * unroll_factor (at least the induction variable increases | |
1657 and the exit branches will get eliminated), so it might be better to use | |
1658 tree_estimate_loop_size + estimated_unrolled_size. */ | |
1659 insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count; | |
1660 if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO) | |
1661 { | |
1662 if (dump_file && (dump_flags & TDF_DETAILS)) | |
1663 fprintf (dump_file, | |
1664 "Not prefetching -- instruction to prefetch ratio (%d) too small\n", | |
1665 insn_to_prefetch_ratio); | |
1666 return false; | |
1667 } | |
1668 | |
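Worked numbers for the two ratio tests above (all values illustrative; the real thresholds are target-tunable parameters):

    /* Illustrative evaluation of both profitability ratios (not GCC
       code; threshold values 3 and 9 are assumed defaults).  */
    static int
    ratios_ok (void)
    {
      unsigned ninsns = 30, mem_ref_count = 6;
      unsigned prefetch_count = 4, unroll_factor = 2;
      int insn_to_mem_ratio = ninsns / mem_ref_count;              /* 5  */
      int insn_to_prefetch_ratio
        = (unroll_factor * ninsns) / prefetch_count;               /* 15 */
      return insn_to_mem_ratio >= 3 && insn_to_prefetch_ratio >= 9;
    }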
1669 /* We cannot estimate further when the trip count is unknown. Just assume | |
1670 prefetching is profitable (possibly too aggressive). */ | |
1572 if (est_niter < 0) | 1671 if (est_niter < 0) |
1573 { | 1672 return true; |
1574 insn_to_prefetch_ratio = ninsns / prefetch_count; | 1673 |
1575 return insn_to_prefetch_ratio >= MIN_INSN_TO_PREFETCH_RATIO; | 1674 if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead)) |
1576 } | |
1577 | |
1578 if (est_niter <= (HOST_WIDE_INT) ahead) | |
1579 { | 1675 { |
1580 if (dump_file && (dump_flags & TDF_DETAILS)) | 1676 if (dump_file && (dump_flags & TDF_DETAILS)) |
1581 fprintf (dump_file, | 1677 fprintf (dump_file, |
1582 "Not prefetching -- loop estimated to roll only %d times\n", | 1678 "Not prefetching -- loop estimated to roll only %d times\n", |
1583 (int) est_niter); | 1679 (int) est_niter); |
1636 HOST_WIDE_INT_PRINT_DEC "\n" | 1732 HOST_WIDE_INT_PRINT_DEC "\n" |
1637 "insn count %d, mem ref count %d, prefetch count %d\n", | 1733 "insn count %d, mem ref count %d, prefetch count %d\n", |
1638 ahead, unroll_factor, est_niter, | 1734 ahead, unroll_factor, est_niter, |
1639 ninsns, mem_ref_count, prefetch_count); | 1735 ninsns, mem_ref_count, prefetch_count); |
1640 | 1736 |
1641 if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns, | 1737 if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns, prefetch_count, |
1642 prefetch_count, mem_ref_count)) | 1738 mem_ref_count, unroll_factor)) |
1643 goto fail; | 1739 goto fail; |
1644 | 1740 |
1645 mark_nontemporal_stores (loop, refs); | 1741 mark_nontemporal_stores (loop, refs); |
1646 | 1742 |
1647 /* Step 4: what to prefetch? */ | 1743 /* Step 4: what to prefetch? */ |