comparison gcc/tree-loop-distribution.c @ 132:d34655255c78

update gcc-8.2
author mir3636
date Thu, 25 Oct 2018 10:21:07 +0900
parents 84e7813d76e9
children 1830386684a0
130:e108057fa461 132:d34655255c78
1 /* Loop distribution. 1 /* Loop distribution.
2 Copyright (C) 2006-2017 Free Software Foundation, Inc. 2 Copyright (C) 2006-2018 Free Software Foundation, Inc.
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr> 3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 and Sebastian Pop <sebastian.pop@amd.com>. 4 and Sebastian Pop <sebastian.pop@amd.com>.
5 5
6 This file is part of GCC. 6 This file is part of GCC.
7 7
88 2) We only fuse partitions in SCC now. A better fusion algorithm is 88 2) We only fuse partitions in SCC now. A better fusion algorithm is
89 desired to minimize loop overhead, maximize parallelism and maximize 89 desired to minimize loop overhead, maximize parallelism and maximize
90 data reuse. */ 90 data reuse. */
91 91
92 #include "config.h" 92 #include "config.h"
93 #define INCLUDE_ALGORITHM /* stable_sort */
94 #include "system.h" 93 #include "system.h"
95 #include "coretypes.h" 94 #include "coretypes.h"
96 #include "backend.h" 95 #include "backend.h"
97 #include "tree.h" 96 #include "tree.h"
98 #include "gimple.h" 97 #include "gimple.h"
582 } 581 }
583 582
584 583
585 /* Kind of distributed loop. */ 584 /* Kind of distributed loop. */
586 enum partition_kind { 585 enum partition_kind {
587 PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE 586 PKIND_NORMAL,
587 /* Partial memset stands for a partition that can be distributed into a
588 loop of memset calls, rather than a single memset call. It's handled
589 just like a normal partition, i.e., distributed as a separate loop; no
590 memset call is generated.
591
592 Note: This is a temporary hack trying to distribute the ZERO-ing stmt in a
593 loop nest as deep as possible. As a result, parloop achieves better
594 parallelization by parallelizing the deeper loop nest. This hack should
595 be unnecessary and removed once distributed memset can be understood
596 and analyzed in data reference analysis. See PR82604 for more. */
597 PKIND_PARTIAL_MEMSET,
598 PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
588 }; 599 };
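To make the PKIND_PARTIAL_MEMSET case above concrete, here is a minimal C sketch (illustrative only, not taken from the patch) of a zeroing loop whose store is consecutive in the innermost loop but not across the whole nest:

    int arr[100][100];

    void
    zero_half_rows (void)
    {
      /* The step equals the access size (4 bytes) in the j-loop, but each
         i iteration skips the second half of a row, so the nest cannot be
         proven to write one contiguous block.  The zeroing statement is
         classified PKIND_PARTIAL_MEMSET and distributed as its own loop,
         which parloops can then parallelize at the deeper nest.  */
      for (int i = 0; i < 100; i++)
        for (int j = 0; j < 50; j++)
          arr[i][j] = 0;
    }

Note that the enum ordering is load-bearing: PKIND_PARTIAL_MEMSET is placed before the true builtin kinds so that partition_builtin_p, rewritten below as kind > PKIND_PARTIAL_MEMSET, keeps treating it as a non-builtin partition.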
589 600
590 /* Type of distributed loop. */ 601 /* Type of distributed loop. */
591 enum partition_type { 602 enum partition_type {
592 /* The distributed loop can be executed in parallel. */ 603 /* The distributed loop can be executed in parallel. */
657 /* Returns true if the partition can be generated as a builtin. */ 668 /* Returns true if the partition can be generated as a builtin. */
658 669
659 static bool 670 static bool
660 partition_builtin_p (partition *partition) 671 partition_builtin_p (partition *partition)
661 { 672 {
662 return partition->kind != PKIND_NORMAL; 673 return partition->kind > PKIND_PARTIAL_MEMSET;
663 } 674 }
664 675
665 /* Returns true if the partition contains a reduction. */ 676 /* Returns true if the partition contains a reduction. */
666 677
667 static bool 678 static bool
824 } 835 }
825 836
826 /* Remove stmts not in the PARTITION bitmap. */ 837 /* Remove stmts not in the PARTITION bitmap. */
827 bbs = get_loop_body_in_dom_order (loop); 838 bbs = get_loop_body_in_dom_order (loop);
828 839
829 if (MAY_HAVE_DEBUG_STMTS) 840 if (MAY_HAVE_DEBUG_BIND_STMTS)
830 for (i = 0; i < loop->num_nodes; i++) 841 for (i = 0; i < loop->num_nodes; i++)
831 { 842 {
832 basic_block bb = bbs[i]; 843 basic_block bb = bbs[i];
833 844
834 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); 845 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
942 if (!const_with_all_bytes_same (TREE_REALPART (val)) 953 if (!const_with_all_bytes_same (TREE_REALPART (val))
943 && !const_with_all_bytes_same (TREE_IMAGPART (val))) 954 && !const_with_all_bytes_same (TREE_IMAGPART (val)))
944 return 0; 955 return 0;
945 break; 956 break;
946 case VECTOR_CST: 957 case VECTOR_CST:
947 unsigned int j; 958 {
948 for (j = 0; j < VECTOR_CST_NELTS (val); ++j) 959 unsigned int count = vector_cst_encoded_nelts (val);
949 if (const_with_all_bytes_same (VECTOR_CST_ELT (val, j))) 960 unsigned int j;
950 break; 961 for (j = 0; j < count; ++j)
951 if (j == VECTOR_CST_NELTS (val)) 962 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
952 return 0; 963 break;
953 break; 964 if (j == count)
965 return 0;
966 break;
967 }
954 default: 968 default:
955 break; 969 break;
956 } 970 }
957 } 971 }
958 972
1122 partition *partition, bool copy_p) 1136 partition *partition, bool copy_p)
1123 { 1137 {
1124 switch (partition->kind) 1138 switch (partition->kind)
1125 { 1139 {
1126 case PKIND_NORMAL: 1140 case PKIND_NORMAL:
1141 case PKIND_PARTIAL_MEMSET:
1127 /* Reductions all have to be in the last partition. */ 1142 /* Reductions all have to be in the last partition. */
1128 gcc_assert (!partition_reduction_p (partition) 1143 gcc_assert (!partition_reduction_p (partition)
1129 || !copy_p); 1144 || !copy_p);
1130 generate_loops_for_partition (loop, partition, copy_p); 1145 generate_loops_for_partition (loop, partition, copy_p);
1131 return false; 1146 return false;
1394 return true; 1409 return true;
1395 } 1410 }
1396 1411
1397 /* Given data reference DR in LOOP_NEST, this function checks the enclosing 1412 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1398 loops from inner to outer to see if the loop's step equals the access size at 1413 loops from inner to outer to see if the loop's step equals the access size at
1399 each level of loop. Return true if yes; record access base and size in 1414 each level of loop. Return 2 if we can prove this at all loop levels;
1400 BASE and SIZE; save loop's step at each level of loop in STEPS if it is 1415 record access base and size in BASE and SIZE; save loop's step at each
1401 not null. For example: 1416 level of loop in STEPS if it is not null. For example:
1402 1417
1403 int arr[100][100][100]; 1418 int arr[100][100][100];
1404 for (i = 0; i < 100; i++) ;steps[2] = 40000 1419 for (i = 0; i < 100; i++) ;steps[2] = 40000
1405 for (j = 100; j > 0; j--) ;steps[1] = -400 1420 for (j = 100; j > 0; j--) ;steps[1] = -400
1406 for (k = 0; k < 100; k++) ;steps[0] = 4 1421 for (k = 0; k < 100; k++) ;steps[0] = 4
1407 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000. */ 1422 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
1408 1423
1409 static bool 1424 Return 1 if we can prove the equality at the innermost loop, but not at
1425 all loop levels. In this case, no information is recorded.
1426
1427 Return 0 if no equality can be proven at any loop level. */
1428
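As an illustration of the remaining return values (my sketch, not from the patch): the in-comment example above returns 2, the half-row pattern shown earlier under PKIND_PARTIAL_MEMSET returns 1, and a column-major walk returns 0 because even the innermost step differs from the access size:

    int a[100][100];

    void
    zero_columns (void)
    {
      for (int i = 0; i < 100; i++)
        for (int j = 0; j < 100; j++)
          a[j][i] = 0;   /* innermost step is 400 bytes, access size is 4:
                            compute_access_range returns 0 */
    }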
1429 static int
1410 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base, 1430 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1411 tree *size, vec<tree> *steps = NULL) 1431 tree *size, vec<tree> *steps = NULL)
1412 { 1432 {
1413 location_t loc = gimple_location (DR_STMT (dr)); 1433 location_t loc = gimple_location (DR_STMT (dr));
1414 basic_block bb = gimple_bb (DR_STMT (dr)); 1434 basic_block bb = gimple_bb (DR_STMT (dr));
1415 struct loop *loop = bb->loop_father; 1435 struct loop *loop = bb->loop_father;
1416 tree ref = DR_REF (dr); 1436 tree ref = DR_REF (dr);
1417 tree access_base = build_fold_addr_expr (ref); 1437 tree access_base = build_fold_addr_expr (ref);
1418 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref)); 1438 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1439 int res = 0;
1419 1440
1420 do { 1441 do {
1421 tree scev_fn = analyze_scalar_evolution (loop, access_base); 1442 tree scev_fn = analyze_scalar_evolution (loop, access_base);
1422 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC) 1443 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1423 return false; 1444 return res;
1424 1445
1425 access_base = CHREC_LEFT (scev_fn); 1446 access_base = CHREC_LEFT (scev_fn);
1426 if (tree_contains_chrecs (access_base, NULL)) 1447 if (tree_contains_chrecs (access_base, NULL))
1427 return false; 1448 return res;
1428 1449
1429 tree scev_step = CHREC_RIGHT (scev_fn); 1450 tree scev_step = CHREC_RIGHT (scev_fn);
1430 /* Only support constant steps. */ 1451 /* Only support constant steps. */
1431 if (TREE_CODE (scev_step) != INTEGER_CST) 1452 if (TREE_CODE (scev_step) != INTEGER_CST)
1432 return false; 1453 return res;
1433 1454
1434 enum ev_direction access_dir = scev_direction (scev_fn); 1455 enum ev_direction access_dir = scev_direction (scev_fn);
1435 if (access_dir == EV_DIR_UNKNOWN) 1456 if (access_dir == EV_DIR_UNKNOWN)
1436 return false; 1457 return res;
1437 1458
1438 if (steps != NULL) 1459 if (steps != NULL)
1439 steps->safe_push (scev_step); 1460 steps->safe_push (scev_step);
1440 1461
1441 scev_step = fold_convert_loc (loc, sizetype, scev_step); 1462 scev_step = fold_convert_loc (loc, sizetype, scev_step);
1444 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step); 1465 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1445 1466
1446 /* At each level of loop, the scev step must equal the access size. In other 1467 /* At each level of loop, the scev step must equal the access size. In other
1447 words, DR must access consecutive memory between loop iterations. */ 1468 words, DR must access consecutive memory between loop iterations. */
1448 if (!operand_equal_p (scev_step, access_size, 0)) 1469 if (!operand_equal_p (scev_step, access_size, 0))
1449 return false; 1470 return res;
1471
1472 /* Access stride can be computed for the data reference at least for the
1473 innermost loop. */
1474 res = 1;
1450 1475
1451 /* Compute DR's execution times in loop. */ 1476 /* Compute DR's execution times in loop. */
1452 tree niters = number_of_latch_executions (loop); 1477 tree niters = number_of_latch_executions (loop);
1453 niters = fold_convert_loc (loc, sizetype, niters); 1478 niters = fold_convert_loc (loc, sizetype, niters);
1454 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb)) 1479 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1466 } 1491 }
1467 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL); 1492 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1468 1493
1469 *base = access_base; 1494 *base = access_base;
1470 *size = access_size; 1495 *size = access_size;
1471 return true; 1496 /* Access stride can be computed for the data reference at each loop level. */
1497 return 2;
1472 } 1498 }
1473 1499
1474 /* Allocate and return builtin struct. Record information like DST_DR, 1500 /* Allocate and return builtin struct. Record information like DST_DR,
1475 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */ 1501 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1476 1502
1505 if (TREE_CODE (rhs) == SSA_NAME 1531 if (TREE_CODE (rhs) == SSA_NAME
1506 && !SSA_NAME_IS_DEFAULT_DEF (rhs) 1532 && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1507 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs)))) 1533 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1508 return; 1534 return;
1509 1535
1510 if (!compute_access_range (loop, dr, &base, &size)) 1536 int res = compute_access_range (loop, dr, &base, &size);
1537 if (res == 0)
1538 return;
1539 if (res == 1)
1540 {
1541 partition->kind = PKIND_PARTIAL_MEMSET;
1542 return;
1543 }
1544
1545 poly_uint64 base_offset;
1546 unsigned HOST_WIDE_INT const_base_offset;
1547 tree base_base = strip_offset (base, &base_offset);
1548 if (!base_offset.is_constant (&const_base_offset))
1511 return; 1549 return;
1512 1550
1513 struct builtin_info *builtin; 1551 struct builtin_info *builtin;
1514 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size); 1552 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1515 builtin->dst_base_base = strip_offset (builtin->dst_base, 1553 builtin->dst_base_base = base_base;
1516 &builtin->dst_base_offset); 1554 builtin->dst_base_offset = const_base_offset;
1517 partition->builtin = builtin; 1555 partition->builtin = builtin;
1518 partition->kind = PKIND_MEMSET; 1556 partition->kind = PKIND_MEMSET;
1519 } 1557 }
1520 1558
1521 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify 1559 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1526 data_reference_p dst_dr, data_reference_p src_dr) 1564 data_reference_p dst_dr, data_reference_p src_dr)
1527 { 1565 {
1528 tree base, size, src_base, src_size; 1566 tree base, size, src_base, src_size;
1529 auto_vec<tree> dst_steps, src_steps; 1567 auto_vec<tree> dst_steps, src_steps;
1530 1568
1531 /* Compute access range of both load and store. They much have the same 1569 /* Compute access range of both load and store. */
1532 access size. */ 1570 int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1533 if (!compute_access_range (loop, dst_dr, &base, &size, &dst_steps) 1571 if (res != 2)
1534 || !compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps) 1572 return;
1535 || !operand_equal_p (size, src_size, 0)) 1573 res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1574 if (res != 2)
1575 return;
1576
1577 /* They must have the same access size. */
1578 if (!operand_equal_p (size, src_size, 0))
1536 return; 1579 return;
1537 1580
1538 /* Load and store in loop nest must access memory in the same way, i.e., 1581 /* Load and store in loop nest must access memory in the same way, i.e.,
1539 they must have the same steps in each loop of the nest. */ 1582 they must have the same steps in each loop of the nest. */
1540 if (dst_steps.length () != src_steps.length ()) 1583 if (dst_steps.length () != src_steps.length ())
1876 /* Known dependences can still be unordered throughout the 1919 /* Known dependences can still be unordered throughout the
1877 iteration space, see gcc.dg/tree-ssa/ldist-16.c. */ 1920 iteration space, see gcc.dg/tree-ssa/ldist-16.c. */
1878 if (DDR_NUM_DIST_VECTS (ddr) != 1) 1921 if (DDR_NUM_DIST_VECTS (ddr) != 1)
1879 this_dir = 2; 1922 this_dir = 2;
1880 /* If the overlap is exact preserve stmt order. */ 1923 /* If the overlap is exact preserve stmt order. */
1881 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1)) 1924 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
1925 DDR_NB_LOOPS (ddr)))
1882 ; 1926 ;
1883 /* Else as the distance vector is lexicographic positive swap 1927 /* Else as the distance vector is lexicographic positive swap
1884 the dependence direction. */ 1928 the dependence direction. */
1885 else 1929 else
1886 this_dir = -this_dir; 1930 this_dir = -this_dir;
2222 for (i = 0; i < num_sccs; ++i) 2266 for (i = 0; i < num_sccs; ++i)
2223 { 2267 {
2224 for (j = 0; partitions->iterate (j, &first); ++j) 2268 for (j = 0; partitions->iterate (j, &first); ++j)
2225 if (pg->vertices[j].component == i) 2269 if (pg->vertices[j].component == i)
2226 break; 2270 break;
2271
2272 bool same_type = true, all_builtins = partition_builtin_p (first);
2227 for (++j; partitions->iterate (j, &partition); ++j) 2273 for (++j; partitions->iterate (j, &partition); ++j)
2228 { 2274 {
2229 if (pg->vertices[j].component != i) 2275 if (pg->vertices[j].component != i)
2230 continue; 2276 continue;
2231 2277
2232 /* Note we Merge partitions of parallel type on purpose, though
2233 the result partition is sequential. The reason is vectorizer
2234 can do more accurate runtime alias check in this case. Also
2235 it results in more conservative distribution. */
2236 if (first->type != partition->type) 2278 if (first->type != partition->type)
2237 { 2279 {
2238 bitmap_clear_bit (sccs_to_merge, i); 2280 same_type = false;
2239 break; 2281 break;
2240 } 2282 }
2283 all_builtins &= partition_builtin_p (partition);
2241 } 2284 }
2285 /* Merge the SCC if all partitions in it have the same type, even though
2286 the resulting partition is sequential, because the vectorizer can then
2287 do a better runtime alias check. One exception is when all partitions
2288 in the SCC are builtins. */
2289 if (!same_type || all_builtins)
2290 bitmap_clear_bit (sccs_to_merge, i);
2242 } 2291 }
2243 2292
2244 /* Initialize callback data for traversing. */ 2293 /* Initialize callback data for traversing. */
2245 cbdata.sccs_to_merge = sccs_to_merge; 2294 cbdata.sccs_to_merge = sccs_to_merge;
2246 cbdata.alias_ddrs = alias_ddrs; 2295 cbdata.alias_ddrs = alias_ddrs;
2319 will be accessed by DR in NITERS iterations. */ 2368 will be accessed by DR in NITERS iterations. */
2320 2369
2321 static tree 2370 static tree
2322 data_ref_segment_size (struct data_reference *dr, tree niters) 2371 data_ref_segment_size (struct data_reference *dr, tree niters)
2323 { 2372 {
2324 tree segment_length; 2373 niters = size_binop (MINUS_EXPR,
2325 2374 fold_convert (sizetype, niters),
2326 if (integer_zerop (DR_STEP (dr))) 2375 size_one_node);
2327 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 2376 return size_binop (MULT_EXPR,
2328 else 2377 fold_convert (sizetype, DR_STEP (dr)),
2329 segment_length = size_binop (MULT_EXPR, 2378 fold_convert (sizetype, niters));
2330 fold_convert (sizetype, DR_STEP (dr)),
2331 fold_convert (sizetype, niters));
2332
2333 return segment_length;
2334 } 2379 }
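A worked example of the new formula (the numbers are mine, not from the patch): for a unit-stride store of 4-byte ints executed NITERS == 100 times,

    /* Hypothetical data reference:
         for (i = 0; i < 100; i++)   // NITERS == 100
           a[i] = 0;                 // DR_STEP == 4
       segment_length = 4 * (100 - 1) = 396 bytes, i.e. the distance from
       the first access to the start of the last one.  The size of the
       final 4-byte access is no longer folded into the segment; it is
       carried separately through the access_size argument added to
       dr_with_seg_len below.  */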
2335 2380
2336 /* Return true if LOOP's latch is dominated by statement for data reference 2381 /* Return true if LOOP's latch is dominated by statement for data reference
2337 DR. */ 2382 DR. */
2338 2383
2383 if (latch_dominated_by_data_ref (loop, dr_b)) 2428 if (latch_dominated_by_data_ref (loop, dr_b))
2384 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one); 2429 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2385 else 2430 else
2386 seg_length_b = data_ref_segment_size (dr_b, niters); 2431 seg_length_b = data_ref_segment_size (dr_b, niters);
2387 2432
2433 unsigned HOST_WIDE_INT access_size_a
2434 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2435 unsigned HOST_WIDE_INT access_size_b
2436 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2437 unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2438 unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2439
2388 dr_with_seg_len_pair_t dr_with_seg_len_pair 2440 dr_with_seg_len_pair_t dr_with_seg_len_pair
2389 (dr_with_seg_len (dr_a, seg_length_a), 2441 (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2390 dr_with_seg_len (dr_b, seg_length_b)); 2442 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b));
2391 2443
2392 /* Canonicalize pairs by sorting the two DR members. */ 2444 /* Canonicalize pairs by sorting the two DR members. */
2393 if (comp_res > 0) 2445 if (comp_res > 0)
2394 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); 2446 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2395 2447
2409 2461
2410 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias 2462 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2411 checks and version LOOP under condition of these runtime alias checks. */ 2463 checks and version LOOP under condition of these runtime alias checks. */
2412 2464
2413 static void 2465 static void
2414 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs) 2466 version_loop_by_alias_check (vec<struct partition *> *partitions,
2467 struct loop *loop, vec<ddr_p> *alias_ddrs)
2415 { 2468 {
2416 profile_probability prob; 2469 profile_probability prob;
2417 basic_block cond_bb; 2470 basic_block cond_bb;
2418 struct loop *nloop; 2471 struct loop *nloop;
2419 tree lhs, arg0, cond_expr = NULL_TREE; 2472 tree lhs, arg0, cond_expr = NULL_TREE;
2432 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr); 2485 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2433 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts, 2486 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2434 is_gimple_val, NULL_TREE); 2487 is_gimple_val, NULL_TREE);
2435 2488
2436 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */ 2489 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2437 if (flag_tree_loop_vectorize) 2490 bool cancelable_p = flag_tree_loop_vectorize;
2438 { 2491 if (cancelable_p)
2439 /* Generate internal function call for loop distribution alias check. */ 2492 {
2493 unsigned i = 0;
2494 struct partition *partition;
2495 for (; partitions->iterate (i, &partition); ++i)
2496 if (!partition_builtin_p (partition))
2497 break;
2498
2499 /* If all partitions are builtins, distributing the loop is profitable and
2500 we don't want to cancel the runtime alias checks. */
2501 if (i == partitions->length ())
2502 cancelable_p = false;
2503 }
2504
2505 /* Generate internal function call for loop distribution alias check if the
2506 runtime alias check should be cancelable. */
2507 if (cancelable_p)
2508 {
2440 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS, 2509 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2441 2, NULL_TREE, cond_expr); 2510 2, NULL_TREE, cond_expr);
2442 lhs = make_ssa_name (boolean_type_node); 2511 lhs = make_ssa_name (boolean_type_node);
2443 gimple_call_set_lhs (call_stmt, lhs); 2512 gimple_call_set_lhs (call_stmt, lhs);
2444 } 2513 }
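The resulting control flow, as a hand-written C-style sketch of the versioned loop (identifiers are illustrative, not actual GIMPLE dump output):

    /* When the check is cancelable, the alias condition is wrapped in the
       internal call; as I read it, the vectorizer later folds the call to
       the real check if the distributed copy vectorizes, and to false
       otherwise so the original loop is kept.  */
    cond = .LOOP_DIST_ALIAS (loop_num, runtime_alias_checks_pass);
    if (cond)
      { /* distributed loops, assuming no aliasing */ }
    else
      { /* original loop, kept as the fallback version */ }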
2490 return (alias_ddrs->length () > 0); 2559 return (alias_ddrs->length () > 0);
2491 } 2560 }
2492 2561
2493 /* Compare base offset of builtin mem* partitions P1 and P2. */ 2562 /* Compare base offset of builtin mem* partitions P1 and P2. */
2494 2563
2495 static bool 2564 static int
2496 offset_cmp (struct partition *p1, struct partition *p2) 2565 offset_cmp (const void *vp1, const void *vp2)
2497 { 2566 {
2498 gcc_assert (p1 != NULL && p1->builtin != NULL); 2567 struct partition *p1 = *(struct partition *const *) vp1;
2499 gcc_assert (p2 != NULL && p2->builtin != NULL); 2568 struct partition *p2 = *(struct partition *const *) vp2;
2500 return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset; 2569 unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
2570 unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
2571 return (o2 < o1) - (o1 < o2);
2501 } 2572 }
2502 2573
2503 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special 2574 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special
2504 case optimization transforming below code: 2575 case optimization transforming below code:
2505 2576
2520 static void 2591 static void
2521 fuse_memset_builtins (vec<struct partition *> *partitions) 2592 fuse_memset_builtins (vec<struct partition *> *partitions)
2522 { 2593 {
2523 unsigned i, j; 2594 unsigned i, j;
2524 struct partition *part1, *part2; 2595 struct partition *part1, *part2;
2596 tree rhs1, rhs2;
2525 2597
2526 for (i = 0; partitions->iterate (i, &part1);) 2598 for (i = 0; partitions->iterate (i, &part1);)
2527 { 2599 {
2528 if (part1->kind != PKIND_MEMSET) 2600 if (part1->kind != PKIND_MEMSET)
2529 { 2601 {
2537 { 2609 {
2538 if (part2->kind != PKIND_MEMSET 2610 if (part2->kind != PKIND_MEMSET
2539 || !operand_equal_p (part1->builtin->dst_base_base, 2611 || !operand_equal_p (part1->builtin->dst_base_base,
2540 part2->builtin->dst_base_base, 0)) 2612 part2->builtin->dst_base_base, 0))
2541 break; 2613 break;
2614
2615 /* Memset calls setting different values can't be merged. */
2616 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2617 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2618 if (!operand_equal_p (rhs1, rhs2, 0))
2619 break;
2542 } 2620 }
2543 2621
2544 /* Stable sort is required in order to avoid breaking dependence. */ 2622 /* Stable sort is required in order to avoid breaking dependence. */
2545 std::stable_sort (&(*partitions)[i], 2623 gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
2546 &(*partitions)[i] + j - i, offset_cmp); 2624 offset_cmp);
2547 /* Continue with next partition. */ 2625 /* Continue with next partition. */
2548 i = j; 2626 i = j;
2549 } 2627 }
2550 2628
2551 /* Merge all consecutive memset builtin partitions. */ 2629 /* Merge all consecutive memset builtin partitions. */
2568 part2->builtin->dst_base_base, 0)) 2646 part2->builtin->dst_base_base, 0))
2569 { 2647 {
2570 i++; 2648 i++;
2571 continue; 2649 continue;
2572 } 2650 }
2573 tree rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); 2651 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2574 tree rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); 2652 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2575 int bytev1 = const_with_all_bytes_same (rhs1); 2653 int bytev1 = const_with_all_bytes_same (rhs1);
2576 int bytev2 = const_with_all_bytes_same (rhs2); 2654 int bytev2 = const_with_all_bytes_same (rhs2);
2577 /* Only merge memset partitions of the same value. */ 2655 /* Only merge memset partitions of the same value. */
2578 if (bytev1 != bytev2 || bytev1 == -1) 2656 if (bytev1 != bytev2 || bytev1 == -1)
2579 { 2657 {
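Since the sample code referenced by the function comment lies outside this hunk, here is a stand-in example (mine) of the fusion performed here:

    /* Two memset partitions share dst_base_base (&s), store the same byte
       value, and cover adjacent offsets 0 and 100; after the stable sort
       by dst_base_offset they can be merged into a single call.  */
    struct { char x[100]; char y[100]; } s;

    void
    zero_pair (void)
    {
      for (int i = 0; i < 100; i++)
        {
          s.x[i] = 0;   /* partition 1: memset (&s.x, 0, 100) */
          s.y[i] = 0;   /* partition 2: memset (&s.y, 0, 100) */
        }
      /* fused result: memset (&s, 0, 200) */
    }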
2609 2687
2610 if (partitions->length () == 1 2688 if (partitions->length () == 1
2611 || alias_ddrs->length () > 0) 2689 || alias_ddrs->length () > 0)
2612 return; 2690 return;
2613 2691
2614 unsigned num_builtin = 0, num_normal = 0; 2692 unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2615 bool same_type_p = true; 2693 bool same_type_p = true;
2616 enum partition_type type = ((*partitions)[0])->type; 2694 enum partition_type type = ((*partitions)[0])->type;
2617 for (i = 0; partitions->iterate (i, &partition); ++i) 2695 for (i = 0; partitions->iterate (i, &partition); ++i)
2618 { 2696 {
2619 same_type_p &= (type == partition->type); 2697 same_type_p &= (type == partition->type);
2620 if (partition->kind != PKIND_NORMAL) 2698 if (partition_builtin_p (partition))
2621 num_builtin++; 2699 {
2622 else 2700 num_builtin++;
2623 num_normal++; 2701 continue;
2702 }
2703 num_normal++;
2704 if (partition->kind == PKIND_PARTIAL_MEMSET)
2705 num_partial_memset++;
2624 } 2706 }
2625 2707
2626 /* Don't distribute current loop into too many loops given we don't have 2708 /* Don't distribute current loop into too many loops given we don't have
2627 memory stream cost model. Be even more conservative in case of loop 2709 memory stream cost model. Be even more conservative in case of loop
2628 nest distribution. */ 2710 nest distribution. */
2629 if ((same_type_p && num_builtin == 0) 2711 if ((same_type_p && num_builtin == 0
2712 && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2630 || (loop->inner != NULL 2713 || (loop->inner != NULL
2631 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1) 2714 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2632 || (loop->inner == NULL 2715 || (loop->inner == NULL
2633 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin)) 2716 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2634 { 2717 {
2772 /* Apply our simple cost model - fuse partitions with similar 2855 /* Apply our simple cost model - fuse partitions with similar
2773 memory accesses. */ 2856 memory accesses. */
2774 for (i = 0; partitions.iterate (i, &into); ++i) 2857 for (i = 0; partitions.iterate (i, &into); ++i)
2775 { 2858 {
2776 bool changed = false; 2859 bool changed = false;
2777 if (partition_builtin_p (into)) 2860 if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
2778 continue; 2861 continue;
2779 for (int j = i + 1; 2862 for (int j = i + 1;
2780 partitions.iterate (j, &partition); ++j) 2863 partitions.iterate (j, &partition); ++j)
2781 { 2864 {
2782 if (share_memory_accesses (rdg, into, partition)) 2865 if (share_memory_accesses (rdg, into, partition))
2822 nbp = 0; 2905 nbp = 0;
2823 goto ldist_done; 2906 goto ldist_done;
2824 } 2907 }
2825 2908
2826 if (version_for_distribution_p (&partitions, &alias_ddrs)) 2909 if (version_for_distribution_p (&partitions, &alias_ddrs))
2827 version_loop_by_alias_check (loop, &alias_ddrs); 2910 version_loop_by_alias_check (&partitions, loop, &alias_ddrs);
2828 2911
2829 if (dump_file && (dump_flags & TDF_DETAILS)) 2912 if (dump_file && (dump_flags & TDF_DETAILS))
2830 { 2913 {
2831 fprintf (dump_file, 2914 fprintf (dump_file,
2832 "distribute loop <%d> into partitions:\n", loop->num); 2915 "distribute loop <%d> into partitions:\n", loop->num);
2952 prepare_perfect_loop_nest (struct loop *loop) 3035 prepare_perfect_loop_nest (struct loop *loop)
2953 { 3036 {
2954 struct loop *outer = loop_outer (loop); 3037 struct loop *outer = loop_outer (loop);
2955 tree niters = number_of_latch_executions (loop); 3038 tree niters = number_of_latch_executions (loop);
2956 3039
2957 /* TODO: We only support the innermost 2-level loop nest distribution 3040 /* TODO: We only support the innermost 3-level loop nest distribution
2958 because of compilation time issues for now. This should be relaxed 3041 because of compilation time issues for now. This should be relaxed
2959 in the future. */ 3042 in the future. Note we only allow 3-level loop nest distribution
2960 while (loop->inner == NULL 3043 when parallelizing loops. */
3044 while ((loop->inner == NULL
3045 || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
2961 && loop_outer (outer) 3046 && loop_outer (outer)
2962 && outer->inner == loop && loop->next == NULL 3047 && outer->inner == loop && loop->next == NULL
2963 && single_exit (outer) 3048 && single_exit (outer)
2964 && optimize_loop_for_speed_p (outer) 3049 && optimize_loop_for_speed_p (outer)
2965 && !chrec_contains_symbols_defined_in_loop (niters, outer->num) 3050 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
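A sketch of what the relaxed condition admits (illustrative only): when more than one thread is requested via -ftree-parallelize-loops, a perfect 3-level nest such as the following is now taken as the distribution candidate, where previously only the innermost 2-level nest was:

    int arr[100][100][100];

    void
    zero_all (void)
    {
      for (int i = 0; i < 100; i++)
        for (int j = 0; j < 100; j++)
          for (int k = 0; k < 100; k++)
            arr[i][j][k] = 0;
    }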
3032 auto_vec<gimple *> work_list; 3117 auto_vec<gimple *> work_list;
3033 if (!find_seed_stmts_for_distribution (loop, &work_list)) 3118 if (!find_seed_stmts_for_distribution (loop, &work_list))
3034 break; 3119 break;
3035 3120
3036 const char *str = loop->inner ? " nest" : ""; 3121 const char *str = loop->inner ? " nest" : "";
3037 location_t loc = find_loop_location (loop); 3122 dump_user_location_t loc = find_loop_location (loop);
3038 if (!cd) 3123 if (!cd)
3039 { 3124 {
3040 calculate_dominance_info (CDI_DOMINATORS); 3125 calculate_dominance_info (CDI_DOMINATORS);
3041 calculate_dominance_info (CDI_POST_DOMINATORS); 3126 calculate_dominance_info (CDI_POST_DOMINATORS);
3042 cd = new control_dependences (); 3127 cd = new control_dependences ();
3092 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); 3177 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3093 } 3178 }
3094 3179
3095 checking_verify_loop_structure (); 3180 checking_verify_loop_structure ();
3096 3181
3097 return 0; 3182 return changed ? TODO_cleanup_cfg : 0;
3098 } 3183 }
3099 3184
3100 } // anon namespace 3185 } // anon namespace
3101 3186
3102 gimple_opt_pass * 3187 gimple_opt_pass *