Mercurial > hg > CbC > CbC_gcc
diff gcc/tree-loop-distribution.c @ 145:1830386684a0
gcc-9.2.0
author | anatofuz |
---|---|
date | Thu, 13 Feb 2020 11:34:05 +0900 |
parents | 84e7813d76e9 |
children |
line wrap: on
line diff
--- a/gcc/tree-loop-distribution.c Thu Oct 25 07:37:49 2018 +0900 +++ b/gcc/tree-loop-distribution.c Thu Feb 13 11:34:05 2020 +0900 @@ -1,5 +1,5 @@ /* Loop distribution. - Copyright (C) 2006-2018 Free Software Foundation, Inc. + Copyright (C) 2006-2020 Free Software Foundation, Inc. Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr> and Sebastian Pop <sebastian.pop@amd.com>. @@ -112,12 +112,13 @@ #include "tree-ssa.h" #include "cfgloop.h" #include "tree-scalar-evolution.h" -#include "params.h" #include "tree-vectorizer.h" +#include "tree-eh.h" +#include "gimple-fold.h" #define MAX_DATAREFS_NUM \ - ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS)) + ((unsigned) param_loop_max_datarefs_for_datadeps) /* Threshold controlling number of distributed partitions. Given it may be unnecessary if a memory stream cost model is invented in the future, @@ -153,18 +154,10 @@ return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2)); } -/* The loop (nest) to be distributed. */ -static vec<loop_p> loop_nest; - -/* Vector of data references in the loop to be distributed. */ -static vec<data_reference_p> datarefs_vec; - -/* Store index of data reference in aux field. */ + + #define DR_INDEX(dr) ((uintptr_t) (dr)->aux) -/* Hash table for data dependence relation in the loop to be distributed. */ -static hash_table<ddr_hasher> *ddrs_table; - /* A Reduced Dependence Graph (RDG) vertex representing a statement. */ struct rdg_vertex { @@ -211,6 +204,83 @@ #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type +/* Kind of distributed loop. */ +enum partition_kind { + PKIND_NORMAL, + /* Partial memset stands for a paritition can be distributed into a loop + of memset calls, rather than a single memset call. It's handled just + like a normal parition, i.e, distributed as separate loop, no memset + call is generated. + + Note: This is a hacking fix trying to distribute ZERO-ing stmt in a + loop nest as deep as possible. As a result, parloop achieves better + parallelization by parallelizing deeper loop nest. This hack should + be unnecessary and removed once distributed memset can be understood + and analyzed in data reference analysis. See PR82604 for more. */ + PKIND_PARTIAL_MEMSET, + PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE +}; + +/* Type of distributed loop. */ +enum partition_type { + /* The distributed loop can be executed parallelly. */ + PTYPE_PARALLEL = 0, + /* The distributed loop has to be executed sequentially. */ + PTYPE_SEQUENTIAL +}; + +/* Builtin info for loop distribution. */ +struct builtin_info +{ + /* data-references a kind != PKIND_NORMAL partition is about. */ + data_reference_p dst_dr; + data_reference_p src_dr; + /* Base address and size of memory objects operated by the builtin. Note + both dest and source memory objects must have the same size. */ + tree dst_base; + tree src_base; + tree size; + /* Base and offset part of dst_base after stripping constant offset. This + is only used in memset builtin distribution for now. */ + tree dst_base_base; + unsigned HOST_WIDE_INT dst_base_offset; +}; + +/* Partition for loop distribution. */ +struct partition +{ + /* Statements of the partition. */ + bitmap stmts; + /* True if the partition defines variable which is used outside of loop. */ + bool reduction_p; + location_t loc; + enum partition_kind kind; + enum partition_type type; + /* Data references in the partition. */ + bitmap datarefs; + /* Information of builtin parition. */ + struct builtin_info *builtin; +}; + +/* Partitions are fused because of different reasons. */ +enum fuse_type +{ + FUSE_NON_BUILTIN = 0, + FUSE_REDUCTION = 1, + FUSE_SHARE_REF = 2, + FUSE_SAME_SCC = 3, + FUSE_FINALIZE = 4 +}; + +/* Description on different fusing reason. */ +static const char *fuse_message[] = { + "they are non-builtins", + "they have reductions", + "they have shared memory refs", + "they are in the same dependence scc", + "there is no point to distribute loop"}; + + /* Dump vertex I in RDG to FILE. */ static void @@ -431,11 +501,205 @@ } } -/* Build the vertices of the reduced dependence graph RDG. Return false - if that failed. */ - -static bool -create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop) + +class loop_distribution +{ + private: + /* The loop (nest) to be distributed. */ + vec<loop_p> loop_nest; + + /* Vector of data references in the loop to be distributed. */ + vec<data_reference_p> datarefs_vec; + + /* If there is nonaddressable data reference in above vector. */ + bool has_nonaddressable_dataref_p; + + /* Store index of data reference in aux field. */ + + /* Hash table for data dependence relation in the loop to be distributed. */ + hash_table<ddr_hasher> *ddrs_table; + + /* Array mapping basic block's index to its topological order. */ + int *bb_top_order_index; + /* And size of the array. */ + int bb_top_order_index_size; + + /* Build the vertices of the reduced dependence graph RDG. Return false + if that failed. */ + bool create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop); + + /* Initialize STMTS with all the statements of LOOP. We use topological + order to discover all statements. The order is important because + generate_loops_for_partition is using the same traversal for identifying + statements in loop copies. */ + void stmts_from_loop (class loop *loop, vec<gimple *> *stmts); + + + /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of + LOOP, and one edge per flow dependence or control dependence from control + dependence CD. During visiting each statement, data references are also + collected and recorded in global data DATAREFS_VEC. */ + struct graph * build_rdg (class loop *loop, control_dependences *cd); + +/* Merge PARTITION into the partition DEST. RDG is the reduced dependence + graph and we update type for result partition if it is non-NULL. */ + void partition_merge_into (struct graph *rdg, + partition *dest, partition *partition, + enum fuse_type ft); + + + /* Return data dependence relation for data references A and B. The two + data references must be in lexicographic order wrto reduced dependence + graph RDG. We firstly try to find ddr from global ddr hash table. If + it doesn't exist, compute the ddr and cache it. */ + data_dependence_relation * get_data_dependence (struct graph *rdg, + data_reference_p a, + data_reference_p b); + + + /* In reduced dependence graph RDG for loop distribution, return true if + dependence between references DR1 and DR2 leads to a dependence cycle + and such dependence cycle can't be resolved by runtime alias check. */ + bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1, + data_reference_p dr2); + + + /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update + PARTITION1's type after merging PARTITION2 into PARTITION1. */ + void update_type_for_merge (struct graph *rdg, + partition *partition1, partition *partition2); + + + /* Returns a partition with all the statements needed for computing + the vertex V of the RDG, also including the loop exit conditions. */ + partition *build_rdg_partition_for_vertex (struct graph *rdg, int v); + + /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify + if it forms builtin memcpy or memmove call. */ + void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition, + data_reference_p dst_dr, data_reference_p src_dr); + + /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP. + For the moment we detect memset, memcpy and memmove patterns. Bitmap + STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. + Returns true if there is a reduction in all partitions and we + possibly did not mark PARTITION as having one for this reason. */ + + bool + classify_partition (loop_p loop, + struct graph *rdg, partition *partition, + bitmap stmt_in_all_partitions); + + + /* Returns true when PARTITION1 and PARTITION2 access the same memory + object in RDG. */ + bool share_memory_accesses (struct graph *rdg, + partition *partition1, partition *partition2); + + /* For each seed statement in STARTING_STMTS, this function builds + partition for it by adding depended statements according to RDG. + All partitions are recorded in PARTITIONS. */ + void rdg_build_partitions (struct graph *rdg, + vec<gimple *> starting_stmts, + vec<partition *> *partitions); + + /* Compute partition dependence created by the data references in DRS1 + and DRS2, modify and return DIR according to that. IF ALIAS_DDR is + not NULL, we record dependence introduced by possible alias between + two data references in ALIAS_DDRS; otherwise, we simply ignore such + dependence as if it doesn't exist at all. */ + int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1, + bitmap drs2, vec<ddr_p> *alias_ddrs); + + + /* Build and return partition dependence graph for PARTITIONS. RDG is + reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P + is true, data dependence caused by possible alias between references + is ignored, as if it doesn't exist at all; otherwise all depdendences + are considered. */ + struct graph *build_partition_graph (struct graph *rdg, + vec<struct partition *> *partitions, + bool ignore_alias_p); + + /* Given reduced dependence graph RDG merge strong connected components + of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused by + possible alias between references is ignored, as if it doesn't exist + at all; otherwise all depdendences are considered. */ + void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *> + *partitions, bool ignore_alias_p); + +/* This is the main function breaking strong conected components in + PARTITIONS giving reduced depdendence graph RDG. Store data dependence + relations for runtime alias check in ALIAS_DDRS. */ + void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *> + *partitions, vec<ddr_p> *alias_ddrs); + + + /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. + ALIAS_DDRS contains ddrs which need runtime alias check. */ + void finalize_partitions (class loop *loop, vec<struct partition *> + *partitions, vec<ddr_p> *alias_ddrs); + + /* Distributes the code from LOOP in such a way that producer statements + are placed before consumer statements. Tries to separate only the + statements from STMTS into separate loops. Returns the number of + distributed loops. Set NB_CALLS to number of generated builtin calls. + Set *DESTROY_P to whether LOOP needs to be destroyed. */ + int distribute_loop (class loop *loop, vec<gimple *> stmts, + control_dependences *cd, int *nb_calls, bool *destroy_p, + bool only_patterns_p); + + /* Compute topological order for basic blocks. Topological order is + needed because data dependence is computed for data references in + lexicographical order. */ + void bb_top_order_init (void); + + void bb_top_order_destroy (void); + + public: + + /* Getter for bb_top_order. */ + + inline int get_bb_top_order_index_size (void) + { + return bb_top_order_index_size; + } + + inline int get_bb_top_order_index (int i) + { + return bb_top_order_index[i]; + } + + unsigned int execute (function *fun); +}; + + +/* If X has a smaller topological sort number than Y, returns -1; + if greater, returns 1. */ +static int +bb_top_order_cmp_r (const void *x, const void *y, void *loop) +{ + loop_distribution *_loop = + (loop_distribution *) loop; + + basic_block bb1 = *(const basic_block *) x; + basic_block bb2 = *(const basic_block *) y; + + int bb_top_order_index_size = _loop->get_bb_top_order_index_size (); + + gcc_assert (bb1->index < bb_top_order_index_size + && bb2->index < bb_top_order_index_size); + gcc_assert (bb1 == bb2 + || _loop->get_bb_top_order_index(bb1->index) + != _loop->get_bb_top_order_index(bb2->index)); + + return (_loop->get_bb_top_order_index(bb1->index) - + _loop->get_bb_top_order_index(bb2->index)); +} + +bool +loop_distribution::create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, + loop_p loop) { int i; gimple *stmt; @@ -466,44 +730,17 @@ else RDGV_HAS_MEM_WRITE (v) = true; RDGV_DATAREFS (v).safe_push (dr); + has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref); } } return true; } -/* Array mapping basic block's index to its topological order. */ -static int *bb_top_order_index; -/* And size of the array. */ -static int bb_top_order_index_size; - -/* If X has a smaller topological sort number than Y, returns -1; - if greater, returns 1. */ - -static int -bb_top_order_cmp (const void *x, const void *y) -{ - basic_block bb1 = *(const basic_block *) x; - basic_block bb2 = *(const basic_block *) y; - - gcc_assert (bb1->index < bb_top_order_index_size - && bb2->index < bb_top_order_index_size); - gcc_assert (bb1 == bb2 - || bb_top_order_index[bb1->index] - != bb_top_order_index[bb2->index]); - - return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]); -} - -/* Initialize STMTS with all the statements of LOOP. We use topological - order to discover all statements. The order is important because - generate_loops_for_partition is using the same traversal for identifying - statements in loop copies. */ - -static void -stmts_from_loop (struct loop *loop, vec<gimple *> *stmts) +void +loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts) { unsigned int i; - basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp); + basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r); for (i = 0; i < loop->num_nodes; i++) { @@ -552,13 +789,8 @@ free_graph (rdg); } -/* Build the Reduced Dependence Graph (RDG) with one vertex per statement of - LOOP, and one edge per flow dependence or control dependence from control - dependence CD. During visiting each statement, data references are also - collected and recorded in global data DATAREFS_VEC. */ - -static struct graph * -build_rdg (struct loop *loop, control_dependences *cd) +struct graph * +loop_distribution::build_rdg (class loop *loop, control_dependences *cd) { struct graph *rdg; @@ -581,64 +813,6 @@ } -/* Kind of distributed loop. */ -enum partition_kind { - PKIND_NORMAL, - /* Partial memset stands for a paritition can be distributed into a loop - of memset calls, rather than a single memset call. It's handled just - like a normal parition, i.e, distributed as separate loop, no memset - call is generated. - - Note: This is a hacking fix trying to distribute ZERO-ing stmt in a - loop nest as deep as possible. As a result, parloop achieves better - parallelization by parallelizing deeper loop nest. This hack should - be unnecessary and removed once distributed memset can be understood - and analyzed in data reference analysis. See PR82604 for more. */ - PKIND_PARTIAL_MEMSET, - PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE -}; - -/* Type of distributed loop. */ -enum partition_type { - /* The distributed loop can be executed parallelly. */ - PTYPE_PARALLEL = 0, - /* The distributed loop has to be executed sequentially. */ - PTYPE_SEQUENTIAL -}; - -/* Builtin info for loop distribution. */ -struct builtin_info -{ - /* data-references a kind != PKIND_NORMAL partition is about. */ - data_reference_p dst_dr; - data_reference_p src_dr; - /* Base address and size of memory objects operated by the builtin. Note - both dest and source memory objects must have the same size. */ - tree dst_base; - tree src_base; - tree size; - /* Base and offset part of dst_base after stripping constant offset. This - is only used in memset builtin distribution for now. */ - tree dst_base_base; - unsigned HOST_WIDE_INT dst_base_offset; -}; - -/* Partition for loop distribution. */ -struct partition -{ - /* Statements of the partition. */ - bitmap stmts; - /* True if the partition defines variable which is used outside of loop. */ - bool reduction_p; - enum partition_kind kind; - enum partition_type type; - /* Data references in the partition. */ - bitmap datarefs; - /* Information of builtin parition. */ - struct builtin_info *builtin; -}; - - /* Allocate and initialize a partition from BITMAP. */ static partition * @@ -647,7 +821,9 @@ partition *partition = XCNEW (struct partition); partition->stmts = BITMAP_ALLOC (NULL); partition->reduction_p = false; + partition->loc = UNKNOWN_LOCATION; partition->kind = PKIND_NORMAL; + partition->type = PTYPE_PARALLEL; partition->datarefs = BITMAP_ALLOC (NULL); return partition; } @@ -681,33 +857,9 @@ return partition->reduction_p; } -/* Partitions are fused because of different reasons. */ -enum fuse_type -{ - FUSE_NON_BUILTIN = 0, - FUSE_REDUCTION = 1, - FUSE_SHARE_REF = 2, - FUSE_SAME_SCC = 3, - FUSE_FINALIZE = 4 -}; - -/* Description on different fusing reason. */ -static const char *fuse_message[] = { - "they are non-builtins", - "they have reductions", - "they have shared memory refs", - "they are in the same dependence scc", - "there is no point to distribute loop"}; - -static void -update_type_for_merge (struct graph *, partition *, partition *); - -/* Merge PARTITION into the partition DEST. RDG is the reduced dependence - graph and we update type for result partition if it is non-NULL. */ - -static void -partition_merge_into (struct graph *rdg, partition *dest, - partition *partition, enum fuse_type ft) +void +loop_distribution::partition_merge_into (struct graph *rdg, + partition *dest, partition *partition, enum fuse_type ft) { if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -778,10 +930,10 @@ /* Return a copy of LOOP placed before LOOP. */ -static struct loop * -copy_loop_before (struct loop *loop) +static class loop * +copy_loop_before (class loop *loop) { - struct loop *res; + class loop *res; edge preheader = loop_preheader_edge (loop); initialize_original_copy_tables (); @@ -796,7 +948,7 @@ /* Creates an empty basic block after LOOP. */ static void -create_bb_after_loop (struct loop *loop) +create_bb_after_loop (class loop *loop) { edge exit = single_exit (loop); @@ -813,7 +965,7 @@ basic blocks of a loop are taken in dom order. */ static void -generate_loops_for_partition (struct loop *loop, partition *partition, +generate_loops_for_partition (class loop *loop, partition *partition, bool copy_p) { unsigned i; @@ -985,7 +1137,7 @@ /* Generate a call to memset for PARTITION in LOOP. */ static void -generate_memset_builtin (struct loop *loop, partition *partition) +generate_memset_builtin (class loop *loop, partition *partition) { gimple_stmt_iterator gsi; tree mem, fn, nb_bytes; @@ -996,7 +1148,7 @@ /* The new statements will be placed before LOOP. */ gsi = gsi_last_bb (loop_preheader_edge (loop)->src); - nb_bytes = builtin->size; + nb_bytes = rewrite_to_non_trapping_overflow (builtin->size); nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE, false, GSI_CONTINUE_LINKING); mem = builtin->dst_base; @@ -1022,7 +1174,9 @@ fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET)); fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes); + gimple_set_location (fn_call, partition->loc); gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); + fold_stmt (&gsi); if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -1037,7 +1191,7 @@ /* Generate a call to memcpy for PARTITION in LOOP. */ static void -generate_memcpy_builtin (struct loop *loop, partition *partition) +generate_memcpy_builtin (class loop *loop, partition *partition) { gimple_stmt_iterator gsi; gimple *fn_call; @@ -1048,7 +1202,7 @@ /* The new statements will be placed before LOOP. */ gsi = gsi_last_bb (loop_preheader_edge (loop)->src); - nb_bytes = builtin->size; + nb_bytes = rewrite_to_non_trapping_overflow (builtin->size); nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE, false, GSI_CONTINUE_LINKING); dest = builtin->dst_base; @@ -1065,7 +1219,9 @@ false, GSI_CONTINUE_LINKING); fn = build_fold_addr_expr (builtin_decl_implicit (kind)); fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes); + gimple_set_location (fn_call, partition->loc); gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); + fold_stmt (&gsi); if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -1079,7 +1235,7 @@ /* Remove and destroy the loop LOOP. */ static void -destroy_loop (struct loop *loop) +destroy_loop (class loop *loop) { unsigned nbbs = loop->num_nodes; edge exit = single_exit (loop); @@ -1089,6 +1245,50 @@ bbs = get_loop_body_in_dom_order (loop); + gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest); + bool safe_p = single_pred_p (exit->dest); + for (unsigned i = 0; i < nbbs; ++i) + { + /* We have made sure to not leave any dangling uses of SSA + names defined in the loop. With the exception of virtuals. + Make sure we replace all uses of virtual defs that will remain + outside of the loop with the bare symbol as delete_basic_block + will release them. */ + for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + if (virtual_operand_p (gimple_phi_result (phi))) + mark_virtual_phi_result_for_renaming (phi); + } + for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);) + { + gimple *stmt = gsi_stmt (gsi); + tree vdef = gimple_vdef (stmt); + if (vdef && TREE_CODE (vdef) == SSA_NAME) + mark_virtual_operand_for_renaming (vdef); + /* Also move and eventually reset debug stmts. We can leave + constant values in place in case the stmt dominates the exit. + ??? Non-constant values from the last iteration can be + replaced with final values if we can compute them. */ + if (gimple_debug_bind_p (stmt)) + { + tree val = gimple_debug_bind_get_value (stmt); + gsi_move_before (&gsi, &dst_gsi); + if (val + && (!safe_p + || !is_gimple_min_invariant (val) + || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i]))) + { + gimple_debug_bind_reset_value (stmt); + update_stmt (stmt); + } + } + else + gsi_next (&gsi); + } + } + redirect_edge_pred (exit, src); exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE); exit->flags |= EDGE_FALLTHRU; @@ -1098,27 +1298,7 @@ i = nbbs; do { - /* We have made sure to not leave any dangling uses of SSA - names defined in the loop. With the exception of virtuals. - Make sure we replace all uses of virtual defs that will remain - outside of the loop with the bare symbol as delete_basic_block - will release them. */ --i; - for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); - gsi_next (&gsi)) - { - gphi *phi = gsi.phi (); - if (virtual_operand_p (gimple_phi_result (phi))) - mark_virtual_phi_result_for_renaming (phi); - } - for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); - gsi_next (&gsi)) - { - gimple *stmt = gsi_stmt (gsi); - tree vdef = gimple_vdef (stmt); - if (vdef && TREE_CODE (vdef) == SSA_NAME) - mark_virtual_operand_for_renaming (vdef); - } delete_basic_block (bbs[i]); } while (i != 0); @@ -1132,7 +1312,7 @@ /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */ static bool -generate_code_for_partition (struct loop *loop, +generate_code_for_partition (class loop *loop, partition *partition, bool copy_p) { switch (partition->kind) @@ -1165,13 +1345,9 @@ return false; } -/* Return data dependence relation for data references A and B. The two - data references must be in lexicographic order wrto reduced dependence - graph RDG. We firstly try to find ddr from global ddr hash table. If - it doesn't exist, compute the ddr and cache it. */ - -static data_dependence_relation * -get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b) +data_dependence_relation * +loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a, + data_reference_p b) { struct data_dependence_relation ent, **slot; struct data_dependence_relation *ddr; @@ -1192,13 +1368,10 @@ return *slot; } -/* In reduced dependence graph RDG for loop distribution, return true if - dependence between references DR1 and DR2 leads to a dependence cycle - and such dependence cycle can't be resolved by runtime alias check. */ - -static bool -data_dep_in_cycle_p (struct graph *rdg, - data_reference_p dr1, data_reference_p dr2) +bool +loop_distribution::data_dep_in_cycle_p (struct graph *rdg, + data_reference_p dr1, + data_reference_p dr2) { struct data_dependence_relation *ddr; @@ -1228,12 +1401,10 @@ return true; } -/* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update - PARTITION1's type after merging PARTITION2 into PARTITION1. */ - -static void -update_type_for_merge (struct graph *rdg, - partition *partition1, partition *partition2) +void +loop_distribution::update_type_for_merge (struct graph *rdg, + partition *partition1, + partition *partition2) { unsigned i, j; bitmap_iterator bi, bj; @@ -1261,11 +1432,8 @@ } } -/* Returns a partition with all the statements needed for computing - the vertex V of the RDG, also including the loop exit conditions. */ - -static partition * -build_rdg_partition_for_vertex (struct graph *rdg, int v) +partition * +loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v) { partition *partition = partition_alloc (); auto_vec<int, 3> nodes; @@ -1309,7 +1477,7 @@ data references. */ static bool -find_single_drs (struct loop *loop, struct graph *rdg, partition *partition, +find_single_drs (class loop *loop, struct graph *rdg, partition *partition, data_reference_p *dst_dr, data_reference_p *src_dr) { unsigned i; @@ -1432,7 +1600,7 @@ { location_t loc = gimple_location (DR_STMT (dr)); basic_block bb = gimple_bb (DR_STMT (dr)); - struct loop *loop = bb->loop_father; + class loop *loop = bb->loop_father; tree ref = DR_REF (dr); tree access_base = build_fold_addr_expr (ref); tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref)); @@ -1559,9 +1727,11 @@ /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify if it forms builtin memcpy or memmove call. */ -static void -classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition, - data_reference_p dst_dr, data_reference_p src_dr) +void +loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg, + partition *partition, + data_reference_p dst_dr, + data_reference_p src_dr) { tree base, size, src_base, src_size; auto_vec<tree> dst_steps, src_steps; @@ -1619,13 +1789,10 @@ return; } -/* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP. - For the moment we detect memset, memcpy and memmove patterns. Bitmap - STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. */ - -static void -classify_partition (loop_p loop, struct graph *rdg, partition *partition, - bitmap stmt_in_all_partitions) +bool +loop_distribution::classify_partition (loop_p loop, + struct graph *rdg, partition *partition, + bitmap stmt_in_all_partitions) { bitmap_iterator bi; unsigned i; @@ -1651,38 +1818,40 @@ to all partitions. In such case, reduction will be computed correctly no matter how partitions are fused/distributed. */ if (!bitmap_bit_p (stmt_in_all_partitions, i)) - { - partition->reduction_p = true; - return; - } - has_reduction = true; + partition->reduction_p = true; + else + has_reduction = true; } } + /* Simple workaround to prevent classifying the partition as builtin + if it contains any use outside of loop. For the case where all + partitions have the reduction this simple workaround is delayed + to only affect the last partition. */ + if (partition->reduction_p) + return has_reduction; + /* Perform general partition disqualification for builtins. */ if (volatiles_p - /* Simple workaround to prevent classifying the partition as builtin - if it contains any use outside of loop. */ - || has_reduction || !flag_tree_loop_distribute_patterns) - return; + return has_reduction; /* Find single load/store data references for builtin partition. */ if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld)) - return; + return has_reduction; + + partition->loc = gimple_location (DR_STMT (single_st)); /* Classify the builtin kind. */ if (single_ld == NULL) classify_builtin_st (loop, partition, single_st); else classify_builtin_ldst (loop, rdg, partition, single_st, single_ld); + return has_reduction; } -/* Returns true when PARTITION1 and PARTITION2 access the same memory - object in RDG. */ - -static bool -share_memory_accesses (struct graph *rdg, +bool +loop_distribution::share_memory_accesses (struct graph *rdg, partition *partition1, partition *partition2) { unsigned i, j; @@ -1729,10 +1898,10 @@ partition for it by adding depended statements according to RDG. All partitions are recorded in PARTITIONS. */ -static void -rdg_build_partitions (struct graph *rdg, - vec<gimple *> starting_stmts, - vec<partition *> *partitions) +void +loop_distribution::rdg_build_partitions (struct graph *rdg, + vec<gimple *> starting_stmts, + vec<partition *> *partitions) { auto_bitmap processed; int i; @@ -1848,14 +2017,8 @@ return false; } -/* Compute partition dependence created by the data references in DRS1 - and DRS2, modify and return DIR according to that. IF ALIAS_DDR is - not NULL, we record dependence introduced by possible alias between - two data references in ALIAS_DDRS; otherwise, we simply ignore such - dependence as if it doesn't exist at all. */ - -static int -pg_add_dependence_edges (struct graph *rdg, int dir, +int +loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs) { unsigned i, j; @@ -1896,7 +2059,7 @@ /* Be conservative. If data references are not well analyzed, or the two data references have the same base address and offset, add dependence and consider it alias to each other. - In other words, the dependence can not be resolved by + In other words, the dependence cannot be resolved by runtime alias check. */ if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2) || !DR_OFFSET (dr1) || !DR_OFFSET (dr2) @@ -2071,10 +2234,10 @@ is ignored, as if it doesn't exist at all; otherwise all depdendences are considered. */ -static struct graph * -build_partition_graph (struct graph *rdg, - vec<struct partition *> *partitions, - bool ignore_alias_p) +struct graph * +loop_distribution::build_partition_graph (struct graph *rdg, + vec<struct partition *> *partitions, + bool ignore_alias_p) { int i, j; struct partition *partition1, *partition2; @@ -2109,7 +2272,7 @@ /* Add edge to partition graph if there exists dependence. There are two types of edges. One type edge is caused by compilation - time known dependence, this type can not be resolved by runtime + time known dependence, this type cannot be resolved by runtime alias check. The other type can be resolved by runtime alias check. */ if (dir == 1 || dir == 2 @@ -2156,15 +2319,10 @@ } } -/* Given reduced dependence graph RDG merge strong connected components - of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused by - possible alias between references is ignored, as if it doesn't exist - at all; otherwise all depdendences are considered. */ - -static void -merge_dep_scc_partitions (struct graph *rdg, - vec<struct partition *> *partitions, - bool ignore_alias_p) +void +loop_distribution::merge_dep_scc_partitions (struct graph *rdg, + vec<struct partition *> *partitions, + bool ignore_alias_p) { struct partition *partition1, *partition2; struct pg_vdata *data; @@ -2237,11 +2395,10 @@ /* This is the main function breaking strong conected components in PARTITIONS giving reduced depdendence graph RDG. Store data dependence relations for runtime alias check in ALIAS_DDRS. */ - -static void -break_alias_scc_partitions (struct graph *rdg, - vec<struct partition *> *partitions, - vec<ddr_p> *alias_ddrs) +void +loop_distribution::break_alias_scc_partitions (struct graph *rdg, + vec<struct partition *> *partitions, + vec<ddr_p> *alias_ddrs) { int i, j, k, num_sccs, num_sccs_no_alias; /* Build partition dependence graph. */ @@ -2382,7 +2539,7 @@ DR. */ static inline bool -latch_dominated_by_data_ref (struct loop *loop, data_reference *dr) +latch_dominated_by_data_ref (class loop *loop, data_reference *dr) { return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, gimple_bb (DR_STMT (dr))); @@ -2392,7 +2549,7 @@ data dependence relations ALIAS_DDRS. */ static void -compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs, +compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs, vec<dr_with_seg_len_pair_t> *comp_alias_pairs) { unsigned int i; @@ -2413,12 +2570,6 @@ struct data_reference *dr_a = DDR_A (ddr); struct data_reference *dr_b = DDR_B (ddr); tree seg_length_a, seg_length_b; - int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a), - DR_BASE_ADDRESS (dr_b)); - - if (comp_res == 0) - comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b)); - gcc_assert (comp_res != 0); if (latch_dominated_by_data_ref (loop, dr_a)) seg_length_a = data_ref_segment_size (dr_a, niters_plus_one); @@ -2439,11 +2590,9 @@ dr_with_seg_len_pair_t dr_with_seg_len_pair (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a), - dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b)); - - /* Canonicalize pairs by sorting the two DR members. */ - if (comp_res > 0) - std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); + dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b), + /* ??? Would WELL_ORDERED be safe? */ + dr_with_seg_len_pair_t::REORDERED); comp_alias_pairs->safe_push (dr_with_seg_len_pair); } @@ -2464,11 +2613,11 @@ static void version_loop_by_alias_check (vec<struct partition *> *partitions, - struct loop *loop, vec<ddr_p> *alias_ddrs) + class loop *loop, vec<ddr_p> *alias_ddrs) { profile_probability prob; basic_block cond_bb; - struct loop *nloop; + class loop *nloop; tree lhs, arg0, cond_expr = NULL_TREE; gimple_seq cond_stmts = NULL; gimple *call_stmt = NULL; @@ -2675,12 +2824,10 @@ } } -/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. - ALIAS_DDRS contains ddrs which need runtime alias check. */ - -static void -finalize_partitions (struct loop *loop, vec<struct partition *> *partitions, - vec<ddr_p> *alias_ddrs) +void +loop_distribution::finalize_partitions (class loop *loop, + vec<struct partition *> *partitions, + vec<ddr_p> *alias_ddrs) { unsigned i; struct partition *partition, *a; @@ -2735,14 +2882,14 @@ distributed loops. Set NB_CALLS to number of generated builtin calls. Set *DESTROY_P to whether LOOP needs to be destroyed. */ -static int -distribute_loop (struct loop *loop, vec<gimple *> stmts, - control_dependences *cd, int *nb_calls, bool *destroy_p) +int +loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts, + control_dependences *cd, int *nb_calls, bool *destroy_p, + bool only_patterns_p) { ddrs_table = new hash_table<ddr_hasher> (389); struct graph *rdg; partition *partition; - bool any_builtin; int i, nbp; *destroy_p = false; @@ -2756,6 +2903,7 @@ } datarefs_vec.create (20); + has_nonaddressable_dataref_p = false; rdg = build_rdg (loop, cd); if (!rdg) { @@ -2801,16 +2949,18 @@ for (i = 1; partitions.iterate (i, &partition); ++i) bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts); - any_builtin = false; + bool any_builtin = false; + bool reduction_in_all = false; FOR_EACH_VEC_ELT (partitions, i, partition) { - classify_partition (loop, rdg, partition, stmt_in_all_partitions); + reduction_in_all + |= classify_partition (loop, rdg, partition, stmt_in_all_partitions); any_builtin |= partition_builtin_p (partition); } /* If we are only distributing patterns but did not detect any, simply bail out. */ - if (!flag_tree_loop_distribution + if (only_patterns_p && !any_builtin) { nbp = 0; @@ -2822,7 +2972,7 @@ a loop into pieces, separated by builtin calls. That is, we only want no or a single loop body remaining. */ struct partition *into; - if (!flag_tree_loop_distribution) + if (only_patterns_p) { for (i = 0; partitions.iterate (i, &into); ++i) if (!partition_builtin_p (into)) @@ -2879,13 +3029,30 @@ i--; } + /* Put a non-builtin partition last if we need to preserve a reduction. + ??? This is a workaround that makes sort_partitions_by_post_order do + the correct thing while in reality it should sort each component + separately and then put the component with a reduction or a non-builtin + last. */ + if (reduction_in_all + && partition_builtin_p (partitions.last())) + FOR_EACH_VEC_ELT (partitions, i, partition) + if (!partition_builtin_p (partition)) + { + partitions.unordered_remove (i); + partitions.quick_push (partition); + break; + } + /* Build the partition dependency graph and fuse partitions in strong connected component. */ if (partitions.length () > 1) { /* Don't support loop nest distribution under runtime alias check - since it's not likely to enable many vectorization opportunities. */ - if (loop->inner) + since it's not likely to enable many vectorization opportunities. + Also if loop has any data reference which may be not addressable + since alias check needs to take, compare address of the object. */ + if (loop->inner || has_nonaddressable_dataref_p) merge_dep_scc_partitions (rdg, &partitions, false); else { @@ -2897,6 +3064,21 @@ finalize_partitions (loop, &partitions, &alias_ddrs); + /* If there is a reduction in all partitions make sure the last one + is not classified for builtin code generation. */ + if (reduction_in_all) + { + partition = partitions.last (); + if (only_patterns_p + && partition_builtin_p (partition) + && !partition_builtin_p (partitions[0])) + { + nbp = 0; + goto ldist_done; + } + partition->kind = PKIND_NORMAL; + } + nbp = partitions.length (); if (nbp == 0 || (nbp == 1 && !partition_builtin_p (partitions[0])) @@ -2941,6 +3123,224 @@ return nbp - *nb_calls; } + +void loop_distribution::bb_top_order_init (void) +{ + int rpo_num; + int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun)); + + bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun)); + bb_top_order_index_size = last_basic_block_for_fn (cfun); + rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true); + for (int i = 0; i < rpo_num; i++) + bb_top_order_index[rpo[i]] = i; + + free (rpo); +} + +void loop_distribution::bb_top_order_destroy () +{ + free (bb_top_order_index); + bb_top_order_index = NULL; + bb_top_order_index_size = 0; +} + + +/* Given LOOP, this function records seed statements for distribution in + WORK_LIST. Return false if there is nothing for distribution. */ + +static bool +find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list) +{ + basic_block *bbs = get_loop_body_in_dom_order (loop); + + /* Initialize the worklist with stmts we seed the partitions with. */ + for (unsigned i = 0; i < loop->num_nodes; ++i) + { + for (gphi_iterator gsi = gsi_start_phis (bbs[i]); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + if (virtual_operand_p (gimple_phi_result (phi))) + continue; + /* Distribute stmts which have defs that are used outside of + the loop. */ + if (!stmt_has_scalar_dependences_outside_loop (loop, phi)) + continue; + work_list->safe_push (phi); + } + for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + /* Ignore clobbers, they do not have true side effects. */ + if (gimple_clobber_p (stmt)) + continue; + + /* If there is a stmt with side-effects bail out - we + cannot and should not distribute this loop. */ + if (gimple_has_side_effects (stmt)) + { + free (bbs); + return false; + } + + /* Distribute stmts which have defs that are used outside of + the loop. */ + if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) + ; + /* Otherwise only distribute stores for now. */ + else if (!gimple_vdef (stmt)) + continue; + + work_list->safe_push (stmt); + } + } + free (bbs); + return work_list->length () > 0; +} + +/* Given innermost LOOP, return the outermost enclosing loop that forms a + perfect loop nest. */ + +static class loop * +prepare_perfect_loop_nest (class loop *loop) +{ + class loop *outer = loop_outer (loop); + tree niters = number_of_latch_executions (loop); + + /* TODO: We only support the innermost 3-level loop nest distribution + because of compilation time issue for now. This should be relaxed + in the future. Note we only allow 3-level loop nest distribution + when parallelizing loops. */ + while ((loop->inner == NULL + || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1)) + && loop_outer (outer) + && outer->inner == loop && loop->next == NULL + && single_exit (outer) + && !chrec_contains_symbols_defined_in_loop (niters, outer->num) + && (niters = number_of_latch_executions (outer)) != NULL_TREE + && niters != chrec_dont_know) + { + loop = outer; + outer = loop_outer (loop); + } + + return loop; +} + + +unsigned int +loop_distribution::execute (function *fun) +{ + class loop *loop; + bool changed = false; + basic_block bb; + control_dependences *cd = NULL; + auto_vec<loop_p> loops_to_be_destroyed; + + if (number_of_loops (fun) <= 1) + return 0; + + bb_top_order_init (); + + FOR_ALL_BB_FN (bb, fun) + { + gimple_stmt_iterator gsi; + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + gimple_set_uid (gsi_stmt (gsi), -1); + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + gimple_set_uid (gsi_stmt (gsi), -1); + } + + /* We can at the moment only distribute non-nested loops, thus restrict + walking to innermost loops. */ + FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) + { + /* Don't distribute multiple exit edges loop, or cold loop when + not doing pattern detection. */ + if (!single_exit (loop) + || (!flag_tree_loop_distribute_patterns + && !optimize_loop_for_speed_p (loop))) + continue; + + /* Don't distribute loop if niters is unknown. */ + tree niters = number_of_latch_executions (loop); + if (niters == NULL_TREE || niters == chrec_dont_know) + continue; + + /* Get the perfect loop nest for distribution. */ + loop = prepare_perfect_loop_nest (loop); + for (; loop; loop = loop->inner) + { + auto_vec<gimple *> work_list; + if (!find_seed_stmts_for_distribution (loop, &work_list)) + break; + + const char *str = loop->inner ? " nest" : ""; + dump_user_location_t loc = find_loop_location (loop); + if (!cd) + { + calculate_dominance_info (CDI_DOMINATORS); + calculate_dominance_info (CDI_POST_DOMINATORS); + cd = new control_dependences (); + free_dominance_info (CDI_POST_DOMINATORS); + } + + bool destroy_p; + int nb_generated_loops, nb_generated_calls; + nb_generated_loops + = distribute_loop (loop, work_list, cd, &nb_generated_calls, + &destroy_p, (!optimize_loop_for_speed_p (loop) + || !flag_tree_loop_distribution)); + if (destroy_p) + loops_to_be_destroyed.safe_push (loop); + + if (nb_generated_loops + nb_generated_calls > 0) + { + changed = true; + if (dump_enabled_p ()) + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, + loc, "Loop%s %d distributed: split to %d loops " + "and %d library calls.\n", str, loop->num, + nb_generated_loops, nb_generated_calls); + + break; + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num); + } + } + + if (cd) + delete cd; + + if (bb_top_order_index != NULL) + bb_top_order_destroy (); + + if (changed) + { + /* Destroy loop bodies that could not be reused. Do this late as we + otherwise can end up refering to stale data in control dependences. */ + unsigned i; + FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop) + destroy_loop (loop); + + /* Cached scalar evolutions now may refer to wrong or non-existing + loops. */ + scev_reset_htab (); + mark_virtual_operands_for_renaming (fun); + rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); + } + + checking_verify_loop_structure (); + + return changed ? TODO_cleanup_cfg : 0; +} + + /* Distribute all loops in the current function. */ namespace { @@ -2976,210 +3376,10 @@ }; // class pass_loop_distribution - -/* Given LOOP, this function records seed statements for distribution in - WORK_LIST. Return false if there is nothing for distribution. */ - -static bool -find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list) -{ - basic_block *bbs = get_loop_body_in_dom_order (loop); - - /* Initialize the worklist with stmts we seed the partitions with. */ - for (unsigned i = 0; i < loop->num_nodes; ++i) - { - for (gphi_iterator gsi = gsi_start_phis (bbs[i]); - !gsi_end_p (gsi); gsi_next (&gsi)) - { - gphi *phi = gsi.phi (); - if (virtual_operand_p (gimple_phi_result (phi))) - continue; - /* Distribute stmts which have defs that are used outside of - the loop. */ - if (!stmt_has_scalar_dependences_outside_loop (loop, phi)) - continue; - work_list->safe_push (phi); - } - for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); - !gsi_end_p (gsi); gsi_next (&gsi)) - { - gimple *stmt = gsi_stmt (gsi); - - /* If there is a stmt with side-effects bail out - we - cannot and should not distribute this loop. */ - if (gimple_has_side_effects (stmt)) - { - free (bbs); - return false; - } - - /* Distribute stmts which have defs that are used outside of - the loop. */ - if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) - ; - /* Otherwise only distribute stores for now. */ - else if (!gimple_vdef (stmt)) - continue; - - work_list->safe_push (stmt); - } - } - free (bbs); - return work_list->length () > 0; -} - -/* Given innermost LOOP, return the outermost enclosing loop that forms a - perfect loop nest. */ - -static struct loop * -prepare_perfect_loop_nest (struct loop *loop) -{ - struct loop *outer = loop_outer (loop); - tree niters = number_of_latch_executions (loop); - - /* TODO: We only support the innermost 3-level loop nest distribution - because of compilation time issue for now. This should be relaxed - in the future. Note we only allow 3-level loop nest distribution - when parallelizing loops. */ - while ((loop->inner == NULL - || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1)) - && loop_outer (outer) - && outer->inner == loop && loop->next == NULL - && single_exit (outer) - && optimize_loop_for_speed_p (outer) - && !chrec_contains_symbols_defined_in_loop (niters, outer->num) - && (niters = number_of_latch_executions (outer)) != NULL_TREE - && niters != chrec_dont_know) - { - loop = outer; - outer = loop_outer (loop); - } - - return loop; -} - unsigned int pass_loop_distribution::execute (function *fun) { - struct loop *loop; - bool changed = false; - basic_block bb; - control_dependences *cd = NULL; - auto_vec<loop_p> loops_to_be_destroyed; - - if (number_of_loops (fun) <= 1) - return 0; - - /* Compute topological order for basic blocks. Topological order is - needed because data dependence is computed for data references in - lexicographical order. */ - if (bb_top_order_index == NULL) - { - int rpo_num; - int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun)); - - bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun)); - bb_top_order_index_size = last_basic_block_for_fn (cfun); - rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true); - for (int i = 0; i < rpo_num; i++) - bb_top_order_index[rpo[i]] = i; - - free (rpo); - } - - FOR_ALL_BB_FN (bb, fun) - { - gimple_stmt_iterator gsi; - for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) - gimple_set_uid (gsi_stmt (gsi), -1); - for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) - gimple_set_uid (gsi_stmt (gsi), -1); - } - - /* We can at the moment only distribute non-nested loops, thus restrict - walking to innermost loops. */ - FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) - { - /* Don't distribute multiple exit edges loop, or cold loop. */ - if (!single_exit (loop) - || !optimize_loop_for_speed_p (loop)) - continue; - - /* Don't distribute loop if niters is unknown. */ - tree niters = number_of_latch_executions (loop); - if (niters == NULL_TREE || niters == chrec_dont_know) - continue; - - /* Get the perfect loop nest for distribution. */ - loop = prepare_perfect_loop_nest (loop); - for (; loop; loop = loop->inner) - { - auto_vec<gimple *> work_list; - if (!find_seed_stmts_for_distribution (loop, &work_list)) - break; - - const char *str = loop->inner ? " nest" : ""; - dump_user_location_t loc = find_loop_location (loop); - if (!cd) - { - calculate_dominance_info (CDI_DOMINATORS); - calculate_dominance_info (CDI_POST_DOMINATORS); - cd = new control_dependences (); - free_dominance_info (CDI_POST_DOMINATORS); - } - - bool destroy_p; - int nb_generated_loops, nb_generated_calls; - nb_generated_loops = distribute_loop (loop, work_list, cd, - &nb_generated_calls, - &destroy_p); - if (destroy_p) - loops_to_be_destroyed.safe_push (loop); - - if (nb_generated_loops + nb_generated_calls > 0) - { - changed = true; - dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, - loc, "Loop%s %d distributed: split to %d loops " - "and %d library calls.\n", str, loop->num, - nb_generated_loops, nb_generated_calls); - - break; - } - - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num); - } - } - - if (cd) - delete cd; - - if (bb_top_order_index != NULL) - { - free (bb_top_order_index); - bb_top_order_index = NULL; - bb_top_order_index_size = 0; - } - - if (changed) - { - /* Destroy loop bodies that could not be reused. Do this late as we - otherwise can end up refering to stale data in control dependences. */ - unsigned i; - FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop) - destroy_loop (loop); - - /* Cached scalar evolutions now may refer to wrong or non-existing - loops. */ - scev_reset_htab (); - mark_virtual_operands_for_renaming (fun); - rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); - } - - checking_verify_loop_structure (); - - return changed ? TODO_cleanup_cfg : 0; + return loop_distribution ().execute (fun); } } // anon namespace