Mercurial > hg > CbC > CbC_gcc
comparison gcc/tree-loop-distribution.c @ 132:d34655255c78
update gcc-8.2
author | mir3636 |
---|---|
date | Thu, 25 Oct 2018 10:21:07 +0900 |
parents | 84e7813d76e9 |
children | 1830386684a0 |
comparison
equal
deleted
inserted
replaced
130:e108057fa461 | 132:d34655255c78 |
---|---|
1 /* Loop distribution. | 1 /* Loop distribution. |
2 Copyright (C) 2006-2017 Free Software Foundation, Inc. | 2 Copyright (C) 2006-2018 Free Software Foundation, Inc. |
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr> | 3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr> |
4 and Sebastian Pop <sebastian.pop@amd.com>. | 4 and Sebastian Pop <sebastian.pop@amd.com>. |
5 | 5 |
6 This file is part of GCC. | 6 This file is part of GCC. |
7 | 7 |
88 2) We only fuse partitions in SCC now. A better fusion algorithm is | 88 2) We only fuse partitions in SCC now. A better fusion algorithm is |
89 desired to minimize loop overhead, maximize parallelism and maximize | 89 desired to minimize loop overhead, maximize parallelism and maximize |
90 data reuse. */ | 90 data reuse. */ |
91 | 91 |
92 #include "config.h" | 92 #include "config.h" |
93 #define INCLUDE_ALGORITHM /* stable_sort */ | |
94 #include "system.h" | 93 #include "system.h" |
95 #include "coretypes.h" | 94 #include "coretypes.h" |
96 #include "backend.h" | 95 #include "backend.h" |
97 #include "tree.h" | 96 #include "tree.h" |
98 #include "gimple.h" | 97 #include "gimple.h" |
582 } | 581 } |
583 | 582 |
584 | 583 |
585 /* Kind of distributed loop. */ | 584 /* Kind of distributed loop. */ |
586 enum partition_kind { | 585 enum partition_kind { |
587 PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE | 586 PKIND_NORMAL, |
587 /* Partial memset stands for a partition that can be distributed into a loop | 
588 of memset calls, rather than a single memset call. It's handled just | |
589 like a normal partition, i.e., distributed as separate loop, no memset | 
590 call is generated. | |
591 | |
592 Note: This is a hacking fix trying to distribute ZERO-ing stmt in a | |
593 loop nest as deep as possible. As a result, parloop achieves better | |
594 parallelization by parallelizing deeper loop nest. This hack should | |
595 be unnecessary and removed once distributed memset can be understood | |
596 and analyzed in data reference analysis. See PR82604 for more. */ | |
597 PKIND_PARTIAL_MEMSET, | |
598 PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE | |
588 }; | 599 }; |
589 | 600 |
590 /* Type of distributed loop. */ | 601 /* Type of distributed loop. */ |
591 enum partition_type { | 602 enum partition_type { |
592 /* The distributed loop can be executed parallelly. */ | 603 /* The distributed loop can be executed parallelly. */ |
657 /* Returns true if the partition can be generated as a builtin. */ | 668 /* Returns true if the partition can be generated as a builtin. */ |
658 | 669 |
659 static bool | 670 static bool |
660 partition_builtin_p (partition *partition) | 671 partition_builtin_p (partition *partition) |
661 { | 672 { |
662 return partition->kind != PKIND_NORMAL; | 673 return partition->kind > PKIND_PARTIAL_MEMSET; |
663 } | 674 } |
664 | 675 |
665 /* Returns true if the partition contains a reduction. */ | 676 /* Returns true if the partition contains a reduction. */ |
666 | 677 |
667 static bool | 678 static bool |
824 } | 835 } |
825 | 836 |
826 /* Remove stmts not in the PARTITION bitmap. */ | 837 /* Remove stmts not in the PARTITION bitmap. */ |
827 bbs = get_loop_body_in_dom_order (loop); | 838 bbs = get_loop_body_in_dom_order (loop); |
828 | 839 |
829 if (MAY_HAVE_DEBUG_STMTS) | 840 if (MAY_HAVE_DEBUG_BIND_STMTS) |
830 for (i = 0; i < loop->num_nodes; i++) | 841 for (i = 0; i < loop->num_nodes; i++) |
831 { | 842 { |
832 basic_block bb = bbs[i]; | 843 basic_block bb = bbs[i]; |
833 | 844 |
834 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); | 845 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); |
942 if (!const_with_all_bytes_same (TREE_REALPART (val)) | 953 if (!const_with_all_bytes_same (TREE_REALPART (val)) |
943 && !const_with_all_bytes_same (TREE_IMAGPART (val))) | 954 && !const_with_all_bytes_same (TREE_IMAGPART (val))) |
944 return 0; | 955 return 0; |
945 break; | 956 break; |
946 case VECTOR_CST: | 957 case VECTOR_CST: |
947 unsigned int j; | 958 { |
948 for (j = 0; j < VECTOR_CST_NELTS (val); ++j) | 959 unsigned int count = vector_cst_encoded_nelts (val); |
949 if (const_with_all_bytes_same (VECTOR_CST_ELT (val, j))) | 960 unsigned int j; |
950 break; | 961 for (j = 0; j < count; ++j) |
951 if (j == VECTOR_CST_NELTS (val)) | 962 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j))) |
952 return 0; | 963 break; |
953 break; | 964 if (j == count) |
965 return 0; | |
966 break; | |
967 } | |
954 default: | 968 default: |
955 break; | 969 break; |
956 } | 970 } |
957 } | 971 } |
958 | 972 |
1122 partition *partition, bool copy_p) | 1136 partition *partition, bool copy_p) |
1123 { | 1137 { |
1124 switch (partition->kind) | 1138 switch (partition->kind) |
1125 { | 1139 { |
1126 case PKIND_NORMAL: | 1140 case PKIND_NORMAL: |
1141 case PKIND_PARTIAL_MEMSET: | |
1127 /* Reductions all have to be in the last partition. */ | 1142 /* Reductions all have to be in the last partition. */ |
1128 gcc_assert (!partition_reduction_p (partition) | 1143 gcc_assert (!partition_reduction_p (partition) |
1129 || !copy_p); | 1144 || !copy_p); |
1130 generate_loops_for_partition (loop, partition, copy_p); | 1145 generate_loops_for_partition (loop, partition, copy_p); |
1131 return false; | 1146 return false; |
1394 return true; | 1409 return true; |
1395 } | 1410 } |
1396 | 1411 |
1397 /* Given data reference DR in LOOP_NEST, this function checks the enclosing | 1412 /* Given data reference DR in LOOP_NEST, this function checks the enclosing |
1398 loops from inner to outer to see if loop's step equals to access size at | 1413 loops from inner to outer to see if loop's step equals to access size at |
1399 each level of loop. Return true if yes; record access base and size in | 1414 each level of loop. Return 2 if we can prove this at all level loops; |
1400 BASE and SIZE; save loop's step at each level of loop in STEPS if it is | 1415 record access base and size in BASE and SIZE; save loop's step at each |
1401 not null. For example: | 1416 level of loop in STEPS if it is not null. For example: |
1402 | 1417 |
1403 int arr[100][100][100]; | 1418 int arr[100][100][100]; |
1404 for (i = 0; i < 100; i++) ;steps[2] = 40000 | 1419 for (i = 0; i < 100; i++) ;steps[2] = 40000 |
1405 for (j = 100; j > 0; j--) ;steps[1] = -400 | 1420 for (j = 100; j > 0; j--) ;steps[1] = -400 |
1406 for (k = 0; k < 100; k++) ;steps[0] = 4 | 1421 for (k = 0; k < 100; k++) ;steps[0] = 4 |
1407 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000. */ | 1422 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000 |
1408 | 1423 |
1409 static bool | 1424 Return 1 if we can prove the equality at the innermost loop, but not all |
1425 level loops. In this case, no information is recorded. | |
1426 | |
1427 Return 0 if no equality can be proven at any level loops. */ | |
1428 | |
1429 static int | |
1410 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base, | 1430 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base, |
1411 tree *size, vec<tree> *steps = NULL) | 1431 tree *size, vec<tree> *steps = NULL) |
1412 { | 1432 { |
1413 location_t loc = gimple_location (DR_STMT (dr)); | 1433 location_t loc = gimple_location (DR_STMT (dr)); |
1414 basic_block bb = gimple_bb (DR_STMT (dr)); | 1434 basic_block bb = gimple_bb (DR_STMT (dr)); |
1415 struct loop *loop = bb->loop_father; | 1435 struct loop *loop = bb->loop_father; |
1416 tree ref = DR_REF (dr); | 1436 tree ref = DR_REF (dr); |
1417 tree access_base = build_fold_addr_expr (ref); | 1437 tree access_base = build_fold_addr_expr (ref); |
1418 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref)); | 1438 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref)); |
1439 int res = 0; | |
1419 | 1440 |
1420 do { | 1441 do { |
1421 tree scev_fn = analyze_scalar_evolution (loop, access_base); | 1442 tree scev_fn = analyze_scalar_evolution (loop, access_base); |
1422 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC) | 1443 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC) |
1423 return false; | 1444 return res; |
1424 | 1445 |
1425 access_base = CHREC_LEFT (scev_fn); | 1446 access_base = CHREC_LEFT (scev_fn); |
1426 if (tree_contains_chrecs (access_base, NULL)) | 1447 if (tree_contains_chrecs (access_base, NULL)) |
1427 return false; | 1448 return res; |
1428 | 1449 |
1429 tree scev_step = CHREC_RIGHT (scev_fn); | 1450 tree scev_step = CHREC_RIGHT (scev_fn); |
1430 /* Only support constant steps. */ | 1451 /* Only support constant steps. */ |
1431 if (TREE_CODE (scev_step) != INTEGER_CST) | 1452 if (TREE_CODE (scev_step) != INTEGER_CST) |
1432 return false; | 1453 return res; |
1433 | 1454 |
1434 enum ev_direction access_dir = scev_direction (scev_fn); | 1455 enum ev_direction access_dir = scev_direction (scev_fn); |
1435 if (access_dir == EV_DIR_UNKNOWN) | 1456 if (access_dir == EV_DIR_UNKNOWN) |
1436 return false; | 1457 return res; |
1437 | 1458 |
1438 if (steps != NULL) | 1459 if (steps != NULL) |
1439 steps->safe_push (scev_step); | 1460 steps->safe_push (scev_step); |
1440 | 1461 |
1441 scev_step = fold_convert_loc (loc, sizetype, scev_step); | 1462 scev_step = fold_convert_loc (loc, sizetype, scev_step); |
1444 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step); | 1465 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step); |
1445 | 1466 |
1446 /* At each level of loop, scev step must equal to access size. In other | 1467 /* At each level of loop, scev step must equal to access size. In other |
1447 words, DR must access consecutive memory between loop iterations. */ | 1468 words, DR must access consecutive memory between loop iterations. */ |
1448 if (!operand_equal_p (scev_step, access_size, 0)) | 1469 if (!operand_equal_p (scev_step, access_size, 0)) |
1449 return false; | 1470 return res; |
1471 | |
1472 /* Access stride can be computed for data reference at least for the | |
1473 innermost loop. */ | |
1474 res = 1; | |
1450 | 1475 |
1451 /* Compute DR's execution times in loop. */ | 1476 /* Compute DR's execution times in loop. */ |
1452 tree niters = number_of_latch_executions (loop); | 1477 tree niters = number_of_latch_executions (loop); |
1453 niters = fold_convert_loc (loc, sizetype, niters); | 1478 niters = fold_convert_loc (loc, sizetype, niters); |
1454 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb)) | 1479 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb)) |
1466 } | 1491 } |
1467 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL); | 1492 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL); |
1468 | 1493 |
1469 *base = access_base; | 1494 *base = access_base; |
1470 *size = access_size; | 1495 *size = access_size; |
1471 return true; | 1496 /* Access stride can be computed for data reference at each level loop. */ |
1497 return 2; | |
1472 } | 1498 } |
1473 | 1499 |
1474 /* Allocate and return builtin struct. Record information like DST_DR, | 1500 /* Allocate and return builtin struct. Record information like DST_DR, |
1475 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */ | 1501 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */ |
1476 | 1502 |
1505 if (TREE_CODE (rhs) == SSA_NAME | 1531 if (TREE_CODE (rhs) == SSA_NAME |
1506 && !SSA_NAME_IS_DEFAULT_DEF (rhs) | 1532 && !SSA_NAME_IS_DEFAULT_DEF (rhs) |
1507 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs)))) | 1533 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs)))) |
1508 return; | 1534 return; |
1509 | 1535 |
1510 if (!compute_access_range (loop, dr, &base, &size)) | 1536 int res = compute_access_range (loop, dr, &base, &size); |
1537 if (res == 0) | |
1538 return; | |
1539 if (res == 1) | |
1540 { | |
1541 partition->kind = PKIND_PARTIAL_MEMSET; | |
1542 return; | |
1543 } | |
1544 | |
1545 poly_uint64 base_offset; | |
1546 unsigned HOST_WIDE_INT const_base_offset; | |
1547 tree base_base = strip_offset (base, &base_offset); | |
1548 if (!base_offset.is_constant (&const_base_offset)) | |
1511 return; | 1549 return; |
1512 | 1550 |
1513 struct builtin_info *builtin; | 1551 struct builtin_info *builtin; |
1514 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size); | 1552 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size); |
1515 builtin->dst_base_base = strip_offset (builtin->dst_base, | 1553 builtin->dst_base_base = base_base; |
1516 &builtin->dst_base_offset); | 1554 builtin->dst_base_offset = const_base_offset; |
1517 partition->builtin = builtin; | 1555 partition->builtin = builtin; |
1518 partition->kind = PKIND_MEMSET; | 1556 partition->kind = PKIND_MEMSET; |
1519 } | 1557 } |
1520 | 1558 |
1521 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify | 1559 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify |
1526 data_reference_p dst_dr, data_reference_p src_dr) | 1564 data_reference_p dst_dr, data_reference_p src_dr) |
1527 { | 1565 { |
1528 tree base, size, src_base, src_size; | 1566 tree base, size, src_base, src_size; |
1529 auto_vec<tree> dst_steps, src_steps; | 1567 auto_vec<tree> dst_steps, src_steps; |
1530 | 1568 |
1531 /* Compute access range of both load and store. They must have the same | 1569 /* Compute access range of both load and store. */ |
1532 access size. */ | 1570 int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps); |
1533 if (!compute_access_range (loop, dst_dr, &base, &size, &dst_steps) | 1571 if (res != 2) |
1534 || !compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps) | 1572 return; |
1535 || !operand_equal_p (size, src_size, 0)) | 1573 res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps); |
1574 if (res != 2) | |
1575 return; | |
1576 | |
1577 /* They must have the same access size. */ | 
1578 if (!operand_equal_p (size, src_size, 0)) | |
1536 return; | 1579 return; |
1537 | 1580 |
1538 /* Load and store in loop nest must access memory in the same way, i.e, | 1581 /* Load and store in loop nest must access memory in the same way, i.e, |
1539 they must have the same steps in each loop of the nest. */ | 1582 they must have the same steps in each loop of the nest. */ |
1540 if (dst_steps.length () != src_steps.length ()) | 1583 if (dst_steps.length () != src_steps.length ()) |
1876 /* Known dependences can still be unordered throughout the | 1919 /* Known dependences can still be unordered throughout the |
1877 iteration space, see gcc.dg/tree-ssa/ldist-16.c. */ | 1920 iteration space, see gcc.dg/tree-ssa/ldist-16.c. */ |
1878 if (DDR_NUM_DIST_VECTS (ddr) != 1) | 1921 if (DDR_NUM_DIST_VECTS (ddr) != 1) |
1879 this_dir = 2; | 1922 this_dir = 2; |
1880 /* If the overlap is exact preserve stmt order. */ | 1923 /* If the overlap is exact preserve stmt order. */ |
1881 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1)) | 1924 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), |
1925 DDR_NB_LOOPS (ddr))) | |
1882 ; | 1926 ; |
1883 /* Else as the distance vector is lexicographic positive swap | 1927 /* Else as the distance vector is lexicographic positive swap |
1884 the dependence direction. */ | 1928 the dependence direction. */ |
1885 else | 1929 else |
1886 this_dir = -this_dir; | 1930 this_dir = -this_dir; |
2222 for (i = 0; i < num_sccs; ++i) | 2266 for (i = 0; i < num_sccs; ++i) |
2223 { | 2267 { |
2224 for (j = 0; partitions->iterate (j, &first); ++j) | 2268 for (j = 0; partitions->iterate (j, &first); ++j) |
2225 if (pg->vertices[j].component == i) | 2269 if (pg->vertices[j].component == i) |
2226 break; | 2270 break; |
2271 | |
2272 bool same_type = true, all_builtins = partition_builtin_p (first); | |
2227 for (++j; partitions->iterate (j, &partition); ++j) | 2273 for (++j; partitions->iterate (j, &partition); ++j) |
2228 { | 2274 { |
2229 if (pg->vertices[j].component != i) | 2275 if (pg->vertices[j].component != i) |
2230 continue; | 2276 continue; |
2231 | 2277 |
2232 /* Note we Merge partitions of parallel type on purpose, though | |
2233 the result partition is sequential. The reason is vectorizer | |
2234 can do more accurate runtime alias check in this case. Also | |
2235 it results in more conservative distribution. */ | |
2236 if (first->type != partition->type) | 2278 if (first->type != partition->type) |
2237 { | 2279 { |
2238 bitmap_clear_bit (sccs_to_merge, i); | 2280 same_type = false; |
2239 break; | 2281 break; |
2240 } | 2282 } |
2283 all_builtins &= partition_builtin_p (partition); | |
2241 } | 2284 } |
2285 /* Merge SCC if all partitions in SCC have the same type, though the | |
2286 result partition is sequential, because vectorizer can do better | |
2287 runtime alias check. One exception is all partitions in SCC are | 
2288 builtins. */ | |
2289 if (!same_type || all_builtins) | |
2290 bitmap_clear_bit (sccs_to_merge, i); | |
2242 } | 2291 } |
2243 | 2292 |
2244 /* Initialize callback data for traversing. */ | 2293 /* Initialize callback data for traversing. */ |
2245 cbdata.sccs_to_merge = sccs_to_merge; | 2294 cbdata.sccs_to_merge = sccs_to_merge; |
2246 cbdata.alias_ddrs = alias_ddrs; | 2295 cbdata.alias_ddrs = alias_ddrs; |
2319 will be accessed by DR in NITERS iterations. */ | 2368 will be accessed by DR in NITERS iterations. */ |
2320 | 2369 |
2321 static tree | 2370 static tree |
2322 data_ref_segment_size (struct data_reference *dr, tree niters) | 2371 data_ref_segment_size (struct data_reference *dr, tree niters) |
2323 { | 2372 { |
2324 tree segment_length; | 2373 niters = size_binop (MINUS_EXPR, |
2325 | 2374 fold_convert (sizetype, niters), |
2326 if (integer_zerop (DR_STEP (dr))) | 2375 size_one_node); |
2327 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); | 2376 return size_binop (MULT_EXPR, |
2328 else | 2377 fold_convert (sizetype, DR_STEP (dr)), |
2329 segment_length = size_binop (MULT_EXPR, | 2378 fold_convert (sizetype, niters)); |
2330 fold_convert (sizetype, DR_STEP (dr)), | |
2331 fold_convert (sizetype, niters)); | |
2332 | |
2333 return segment_length; | |
2334 } | 2379 } |
2335 | 2380 |
2336 /* Return true if LOOP's latch is dominated by statement for data reference | 2381 /* Return true if LOOP's latch is dominated by statement for data reference |
2337 DR. */ | 2382 DR. */ |
2338 | 2383 |
2383 if (latch_dominated_by_data_ref (loop, dr_b)) | 2428 if (latch_dominated_by_data_ref (loop, dr_b)) |
2384 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one); | 2429 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one); |
2385 else | 2430 else |
2386 seg_length_b = data_ref_segment_size (dr_b, niters); | 2431 seg_length_b = data_ref_segment_size (dr_b, niters); |
2387 | 2432 |
2433 unsigned HOST_WIDE_INT access_size_a | |
2434 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a)))); | |
2435 unsigned HOST_WIDE_INT access_size_b | |
2436 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b)))); | |
2437 unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a))); | |
2438 unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b))); | |
2439 | |
2388 dr_with_seg_len_pair_t dr_with_seg_len_pair | 2440 dr_with_seg_len_pair_t dr_with_seg_len_pair |
2389 (dr_with_seg_len (dr_a, seg_length_a), | 2441 (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a), |
2390 dr_with_seg_len (dr_b, seg_length_b)); | 2442 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b)); |
2391 | 2443 |
2392 /* Canonicalize pairs by sorting the two DR members. */ | 2444 /* Canonicalize pairs by sorting the two DR members. */ |
2393 if (comp_res > 0) | 2445 if (comp_res > 0) |
2394 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); | 2446 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); |
2395 | 2447 |
2409 | 2461 |
2410 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias | 2462 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias |
2411 checks and version LOOP under condition of these runtime alias checks. */ | 2463 checks and version LOOP under condition of these runtime alias checks. */ |
2412 | 2464 |
2413 static void | 2465 static void |
2414 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs) | 2466 version_loop_by_alias_check (vec<struct partition *> *partitions, |
2467 struct loop *loop, vec<ddr_p> *alias_ddrs) | |
2415 { | 2468 { |
2416 profile_probability prob; | 2469 profile_probability prob; |
2417 basic_block cond_bb; | 2470 basic_block cond_bb; |
2418 struct loop *nloop; | 2471 struct loop *nloop; |
2419 tree lhs, arg0, cond_expr = NULL_TREE; | 2472 tree lhs, arg0, cond_expr = NULL_TREE; |
2432 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr); | 2485 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr); |
2433 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts, | 2486 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts, |
2434 is_gimple_val, NULL_TREE); | 2487 is_gimple_val, NULL_TREE); |
2435 | 2488 |
2436 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */ | 2489 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */ |
2437 if (flag_tree_loop_vectorize) | 2490 bool cancelable_p = flag_tree_loop_vectorize; |
2438 { | 2491 if (cancelable_p) |
2439 /* Generate internal function call for loop distribution alias check. */ | 2492 { |
2493 unsigned i = 0; | |
2494 struct partition *partition; | |
2495 for (; partitions->iterate (i, &partition); ++i) | |
2496 if (!partition_builtin_p (partition)) | |
2497 break; | |
2498 | |
2499 /* If all partitions are builtins, distributing it would be profitable and | |
2500 we don't want to cancel the runtime alias checks. */ | |
2501 if (i == partitions->length ()) | |
2502 cancelable_p = false; | |
2503 } | |
2504 | |
2505 /* Generate internal function call for loop distribution alias check if the | |
2506 runtime alias check should be cancelable. */ | |
2507 if (cancelable_p) | |
2508 { | |
2440 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS, | 2509 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS, |
2441 2, NULL_TREE, cond_expr); | 2510 2, NULL_TREE, cond_expr); |
2442 lhs = make_ssa_name (boolean_type_node); | 2511 lhs = make_ssa_name (boolean_type_node); |
2443 gimple_call_set_lhs (call_stmt, lhs); | 2512 gimple_call_set_lhs (call_stmt, lhs); |
2444 } | 2513 } |
2490 return (alias_ddrs->length () > 0); | 2559 return (alias_ddrs->length () > 0); |
2491 } | 2560 } |
2492 | 2561 |
2493 /* Compare base offset of builtin mem* partitions P1 and P2. */ | 2562 /* Compare base offset of builtin mem* partitions P1 and P2. */ |
2494 | 2563 |
2495 static bool | 2564 static int |
2496 offset_cmp (struct partition *p1, struct partition *p2) | 2565 offset_cmp (const void *vp1, const void *vp2) |
2497 { | 2566 { |
2498 gcc_assert (p1 != NULL && p1->builtin != NULL); | 2567 struct partition *p1 = *(struct partition *const *) vp1; |
2499 gcc_assert (p2 != NULL && p2->builtin != NULL); | 2568 struct partition *p2 = *(struct partition *const *) vp2; |
2500 return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset; | 2569 unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset; |
2570 unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset; | |
2571 return (o2 < o1) - (o1 < o2); | |
2501 } | 2572 } |
2502 | 2573 |
2503 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special | 2574 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special |
2504 case optimization transforming below code: | 2575 case optimization transforming below code: |
2505 | 2576 |
2520 static void | 2591 static void |
2521 fuse_memset_builtins (vec<struct partition *> *partitions) | 2592 fuse_memset_builtins (vec<struct partition *> *partitions) |
2522 { | 2593 { |
2523 unsigned i, j; | 2594 unsigned i, j; |
2524 struct partition *part1, *part2; | 2595 struct partition *part1, *part2; |
2596 tree rhs1, rhs2; | |
2525 | 2597 |
2526 for (i = 0; partitions->iterate (i, &part1);) | 2598 for (i = 0; partitions->iterate (i, &part1);) |
2527 { | 2599 { |
2528 if (part1->kind != PKIND_MEMSET) | 2600 if (part1->kind != PKIND_MEMSET) |
2529 { | 2601 { |
2537 { | 2609 { |
2538 if (part2->kind != PKIND_MEMSET | 2610 if (part2->kind != PKIND_MEMSET |
2539 || !operand_equal_p (part1->builtin->dst_base_base, | 2611 || !operand_equal_p (part1->builtin->dst_base_base, |
2540 part2->builtin->dst_base_base, 0)) | 2612 part2->builtin->dst_base_base, 0)) |
2541 break; | 2613 break; |
2614 | |
2615 /* Memset calls setting different values can't be merged. */ | |
2616 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); | |
2617 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); | |
2618 if (!operand_equal_p (rhs1, rhs2, 0)) | |
2619 break; | |
2542 } | 2620 } |
2543 | 2621 |
2544 /* Stable sort is required in order to avoid breaking dependence. */ | 2622 /* Stable sort is required in order to avoid breaking dependence. */ |
2545 std::stable_sort (&(*partitions)[i], | 2623 gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i], |
2546 &(*partitions)[i] + j - i, offset_cmp); | 2624 offset_cmp); |
2547 /* Continue with next partition. */ | 2625 /* Continue with next partition. */ |
2548 i = j; | 2626 i = j; |
2549 } | 2627 } |
2550 | 2628 |
2551 /* Merge all consecutive memset builtin partitions. */ | 2629 /* Merge all consecutive memset builtin partitions. */ |
2568 part2->builtin->dst_base_base, 0)) | 2646 part2->builtin->dst_base_base, 0)) |
2569 { | 2647 { |
2570 i++; | 2648 i++; |
2571 continue; | 2649 continue; |
2572 } | 2650 } |
2573 tree rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); | 2651 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); |
2574 tree rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); | 2652 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); |
2575 int bytev1 = const_with_all_bytes_same (rhs1); | 2653 int bytev1 = const_with_all_bytes_same (rhs1); |
2576 int bytev2 = const_with_all_bytes_same (rhs2); | 2654 int bytev2 = const_with_all_bytes_same (rhs2); |
2577 /* Only merge memset partitions of the same value. */ | 2655 /* Only merge memset partitions of the same value. */ |
2578 if (bytev1 != bytev2 || bytev1 == -1) | 2656 if (bytev1 != bytev2 || bytev1 == -1) |
2579 { | 2657 { |
2609 | 2687 |
2610 if (partitions->length () == 1 | 2688 if (partitions->length () == 1 |
2611 || alias_ddrs->length () > 0) | 2689 || alias_ddrs->length () > 0) |
2612 return; | 2690 return; |
2613 | 2691 |
2614 unsigned num_builtin = 0, num_normal = 0; | 2692 unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0; |
2615 bool same_type_p = true; | 2693 bool same_type_p = true; |
2616 enum partition_type type = ((*partitions)[0])->type; | 2694 enum partition_type type = ((*partitions)[0])->type; |
2617 for (i = 0; partitions->iterate (i, &partition); ++i) | 2695 for (i = 0; partitions->iterate (i, &partition); ++i) |
2618 { | 2696 { |
2619 same_type_p &= (type == partition->type); | 2697 same_type_p &= (type == partition->type); |
2620 if (partition->kind != PKIND_NORMAL) | 2698 if (partition_builtin_p (partition)) |
2621 num_builtin++; | 2699 { |
2622 else | 2700 num_builtin++; |
2623 num_normal++; | 2701 continue; |
2702 } | |
2703 num_normal++; | |
2704 if (partition->kind == PKIND_PARTIAL_MEMSET) | |
2705 num_partial_memset++; | |
2624 } | 2706 } |
2625 | 2707 |
2626 /* Don't distribute current loop into too many loops given we don't have | 2708 /* Don't distribute current loop into too many loops given we don't have |
2627 memory stream cost model. Be even more conservative in case of loop | 2709 memory stream cost model. Be even more conservative in case of loop |
2628 nest distribution. */ | 2710 nest distribution. */ |
2629 if ((same_type_p && num_builtin == 0) | 2711 if ((same_type_p && num_builtin == 0 |
2712 && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1)) | |
2630 || (loop->inner != NULL | 2713 || (loop->inner != NULL |
2631 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1) | 2714 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1) |
2632 || (loop->inner == NULL | 2715 || (loop->inner == NULL |
2633 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin)) | 2716 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin)) |
2634 { | 2717 { |
2772 /* Apply our simple cost model - fuse partitions with similar | 2855 /* Apply our simple cost model - fuse partitions with similar |
2773 memory accesses. */ | 2856 memory accesses. */ |
2774 for (i = 0; partitions.iterate (i, &into); ++i) | 2857 for (i = 0; partitions.iterate (i, &into); ++i) |
2775 { | 2858 { |
2776 bool changed = false; | 2859 bool changed = false; |
2777 if (partition_builtin_p (into)) | 2860 if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET) |
2778 continue; | 2861 continue; |
2779 for (int j = i + 1; | 2862 for (int j = i + 1; |
2780 partitions.iterate (j, &partition); ++j) | 2863 partitions.iterate (j, &partition); ++j) |
2781 { | 2864 { |
2782 if (share_memory_accesses (rdg, into, partition)) | 2865 if (share_memory_accesses (rdg, into, partition)) |
2822 nbp = 0; | 2905 nbp = 0; |
2823 goto ldist_done; | 2906 goto ldist_done; |
2824 } | 2907 } |
2825 | 2908 |
2826 if (version_for_distribution_p (&partitions, &alias_ddrs)) | 2909 if (version_for_distribution_p (&partitions, &alias_ddrs)) |
2827 version_loop_by_alias_check (loop, &alias_ddrs); | 2910 version_loop_by_alias_check (&partitions, loop, &alias_ddrs); |
2828 | 2911 |
2829 if (dump_file && (dump_flags & TDF_DETAILS)) | 2912 if (dump_file && (dump_flags & TDF_DETAILS)) |
2830 { | 2913 { |
2831 fprintf (dump_file, | 2914 fprintf (dump_file, |
2832 "distribute loop <%d> into partitions:\n", loop->num); | 2915 "distribute loop <%d> into partitions:\n", loop->num); |
2952 prepare_perfect_loop_nest (struct loop *loop) | 3035 prepare_perfect_loop_nest (struct loop *loop) |
2953 { | 3036 { |
2954 struct loop *outer = loop_outer (loop); | 3037 struct loop *outer = loop_outer (loop); |
2955 tree niters = number_of_latch_executions (loop); | 3038 tree niters = number_of_latch_executions (loop); |
2956 | 3039 |
2957 /* TODO: We only support the innermost 2-level loop nest distribution | 3040 /* TODO: We only support the innermost 3-level loop nest distribution |
2958 because of compilation time issue for now. This should be relaxed | 3041 because of compilation time issue for now. This should be relaxed |
2959 in the future. */ | 3042 in the future. Note we only allow 3-level loop nest distribution |
2960 while (loop->inner == NULL | 3043 when parallelizing loops. */ |
3044 while ((loop->inner == NULL | |
3045 || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1)) | |
2961 && loop_outer (outer) | 3046 && loop_outer (outer) |
2962 && outer->inner == loop && loop->next == NULL | 3047 && outer->inner == loop && loop->next == NULL |
2963 && single_exit (outer) | 3048 && single_exit (outer) |
2964 && optimize_loop_for_speed_p (outer) | 3049 && optimize_loop_for_speed_p (outer) |
2965 && !chrec_contains_symbols_defined_in_loop (niters, outer->num) | 3050 && !chrec_contains_symbols_defined_in_loop (niters, outer->num) |
3032 auto_vec<gimple *> work_list; | 3117 auto_vec<gimple *> work_list; |
3033 if (!find_seed_stmts_for_distribution (loop, &work_list)) | 3118 if (!find_seed_stmts_for_distribution (loop, &work_list)) |
3034 break; | 3119 break; |
3035 | 3120 |
3036 const char *str = loop->inner ? " nest" : ""; | 3121 const char *str = loop->inner ? " nest" : ""; |
3037 location_t loc = find_loop_location (loop); | 3122 dump_user_location_t loc = find_loop_location (loop); |
3038 if (!cd) | 3123 if (!cd) |
3039 { | 3124 { |
3040 calculate_dominance_info (CDI_DOMINATORS); | 3125 calculate_dominance_info (CDI_DOMINATORS); |
3041 calculate_dominance_info (CDI_POST_DOMINATORS); | 3126 calculate_dominance_info (CDI_POST_DOMINATORS); |
3042 cd = new control_dependences (); | 3127 cd = new control_dependences (); |
3092 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); | 3177 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); |
3093 } | 3178 } |
3094 | 3179 |
3095 checking_verify_loop_structure (); | 3180 checking_verify_loop_structure (); |
3096 | 3181 |
3097 return 0; | 3182 return changed ? TODO_cleanup_cfg : 0; |
3098 } | 3183 } |
3099 | 3184 |
3100 } // anon namespace | 3185 } // anon namespace |
3101 | 3186 |
3102 gimple_opt_pass * | 3187 gimple_opt_pass * |