Mercurial > hg > CbC > CbC_gcc
annotate gcc/tree-loop-distribution.c @ 158:494b0b89df80 default tip
...
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 25 May 2020 18:13:55 +0900 |
parents | 1830386684a0 |
children |
rev | line source |
---|---|
0 | 1 /* Loop distribution. |
145 | 2 Copyright (C) 2006-2020 Free Software Foundation, Inc. |
0 | 3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr> |
4 and Sebastian Pop <sebastian.pop@amd.com>. | |
5 | |
6 This file is part of GCC. | |
55
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
7 |
0 | 8 GCC is free software; you can redistribute it and/or modify it |
9 under the terms of the GNU General Public License as published by the | |
10 Free Software Foundation; either version 3, or (at your option) any | |
11 later version. | |
55
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
12 |
0 | 13 GCC is distributed in the hope that it will be useful, but WITHOUT |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 for more details. | |
55
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
17 |
0 | 18 You should have received a copy of the GNU General Public License |
19 along with GCC; see the file COPYING3. If not see | |
20 <http://www.gnu.org/licenses/>. */ | |
21 | |
22 /* This pass performs loop distribution: for example, the loop | |
23 | |
24 |DO I = 2, N | |
25 | A(I) = B(I) + C | |
26 | D(I) = A(I-1)*E | |
27 |ENDDO | |
28 | |
55
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
29 is transformed to |
0 | 30 |
31 |DOALL I = 2, N | |
32 | A(I) = B(I) + C | |
33 |ENDDO | |
34 | | |
35 |DOALL I = 2, N | |
36 | D(I) = A(I-1)*E | |
37 |ENDDO | |
38 | |
111 | 39 Loop distribution is the dual of loop fusion. It separates statements |
40 of a loop (or loop nest) into multiple loops (or loop nests) with the | |
41 same loop header. The major goal is to separate statements which may | |
42 be vectorized from those that can't. This pass implements distribution | |
43 in the following steps: | |
44 | |
45 1) Seed partitions with specific type statements. For now we support | |
46 two types seed statements: statement defining variable used outside | |
47 of loop; statement storing to memory. | |
48 2) Build reduced dependence graph (RDG) for loop to be distributed. | |
49 The vertices (RDG:V) model all statements in the loop and the edges | |
50 (RDG:E) model flow and control dependencies between statements. | |
51 3) Apart from RDG, compute data dependencies between memory references. | |
52 4) Starting from seed statement, build up partition by adding depended | |
53 statements according to RDG's dependence information. Partition is | |
54 classified as parallel type if it can be executed paralleled; or as | |
55 sequential type if it can't. Parallel type partition is further | |
56 classified as different builtin kinds if it can be implemented as | |
57 builtin function calls. | |
58 5) Build partition dependence graph (PG) based on data dependencies. | |
59 The vertices (PG:V) model all partitions and the edges (PG:E) model | |
60 all data dependencies between every partitions pair. In general, | |
61 data dependence is either compilation time known or unknown. In C | |
62 family languages, there exists quite amount compilation time unknown | |
63 dependencies because of possible alias relation of data references. | |
64 We categorize PG's edge to two types: "true" edge that represents | |
65 compilation time known data dependencies; "alias" edge for all other | |
66 data dependencies. | |
67 6) Traverse subgraph of PG as if all "alias" edges don't exist. Merge | |
68 partitions in each strong connected component (SCC) correspondingly. | |
69 Build new PG for merged partitions. | |
70 7) Traverse PG again and this time with both "true" and "alias" edges | |
71 included. We try to break SCCs by removing some edges. Because | |
72 SCCs by "true" edges are all fused in step 6), we can break SCCs | |
73 by removing some "alias" edges. It's NP-hard to choose optimal | |
74 edge set, fortunately simple approximation is good enough for us | |
75 given the small problem scale. | |
76 8) Collect all data dependencies of the removed "alias" edges. Create | |
77 runtime alias checks for collected data dependencies. | |
78 9) Version loop under the condition of runtime alias checks. Given | |
79 loop distribution generally introduces additional overhead, it is | |
80 only useful if vectorization is achieved in distributed loop. We | |
81 version loop with internal function call IFN_LOOP_DIST_ALIAS. If | |
82 no distributed loop can be vectorized, we simply remove distributed | |
83 loops and recover to the original one. | |
84 | |
85 TODO: | |
86 1) We only distribute innermost two-level loop nest now. We should | |
87 extend it for arbitrary loop nests in the future. | |
88 2) We only fuse partitions in SCC now. A better fusion algorithm is | |
89 desired to minimize loop overhead, maximize parallelism and maximize | |
90 data reuse. */ | |
0 | 91 |
92 #include "config.h" | |
93 #include "system.h" | |
94 #include "coretypes.h" | |
111 | 95 #include "backend.h" |
96 #include "tree.h" | |
97 #include "gimple.h" | |
98 #include "cfghooks.h" | |
0 | 99 #include "tree-pass.h" |
111 | 100 #include "ssa.h" |
101 #include "gimple-pretty-print.h" | |
102 #include "fold-const.h" | |
103 #include "cfganal.h" | |
104 #include "gimple-iterator.h" | |
105 #include "gimplify-me.h" | |
106 #include "stor-layout.h" | |
107 #include "tree-cfg.h" | |
108 #include "tree-ssa-loop-manip.h" | |
109 #include "tree-ssa-loop-ivopts.h" | |
110 #include "tree-ssa-loop.h" | |
111 #include "tree-into-ssa.h" | |
112 #include "tree-ssa.h" | |
113 #include "cfgloop.h" | |
114 #include "tree-scalar-evolution.h" | |
115 #include "tree-vectorizer.h" | |
145 | 116 #include "tree-eh.h" |
117 #include "gimple-fold.h" | |
111 | 118 |
119 | |
120 #define MAX_DATAREFS_NUM \ | |
145 | 121 ((unsigned) param_loop_max_datarefs_for_datadeps) |
111 | 122 |
123 /* Threshold controlling number of distributed partitions. Given it may | |
124 be unnecessary if a memory stream cost model is invented in the future, | |
125 we define it as a temporary macro, rather than a parameter. */ | |
126 #define NUM_PARTITION_THRESHOLD (4) | |
127 | |
128 /* Hashtable helpers. */ | |
129 | |
130 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation> | |
131 { | |
132 static inline hashval_t hash (const data_dependence_relation *); | |
133 static inline bool equal (const data_dependence_relation *, | |
134 const data_dependence_relation *); | |
135 }; | |
136 | |
137 /* Hash function for data dependence. */ | |
138 | |
139 inline hashval_t | |
140 ddr_hasher::hash (const data_dependence_relation *ddr) | |
141 { | |
142 inchash::hash h; | |
143 h.add_ptr (DDR_A (ddr)); | |
144 h.add_ptr (DDR_B (ddr)); | |
145 return h.end (); | |
146 } | |
147 | |
148 /* Hash table equality function for data dependence. */ | |
149 | |
150 inline bool | |
151 ddr_hasher::equal (const data_dependence_relation *ddr1, | |
152 const data_dependence_relation *ddr2) | |
153 { | |
154 return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2)); | |
155 } | |
156 | |
145 | 157 |
158 | |
111 | 159 #define DR_INDEX(dr) ((uintptr_t) (dr)->aux) |
160 | |
161 /* A Reduced Dependence Graph (RDG) vertex representing a statement. */ | |
162 struct rdg_vertex | |
163 { | |
164 /* The statement represented by this vertex. */ | |
165 gimple *stmt; | |
166 | |
167 /* Vector of data-references in this statement. */ | |
168 vec<data_reference_p> datarefs; | |
169 | |
170 /* True when the statement contains a write to memory. */ | |
171 bool has_mem_write; | |
172 | |
173 /* True when the statement contains a read from memory. */ | |
174 bool has_mem_reads; | |
175 }; | |
176 | |
177 #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt | |
178 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs | |
179 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write | |
180 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads | |
181 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I])) | |
182 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I])) | |
183 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I])) | |
184 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I])) | |
185 | |
186 /* Data dependence type. */ | |
187 | |
188 enum rdg_dep_type | |
189 { | |
190 /* Read After Write (RAW). */ | |
191 flow_dd = 'f', | |
192 | |
193 /* Control dependence (execute conditional on). */ | |
194 control_dd = 'c' | |
195 }; | |
196 | |
197 /* Dependence information attached to an edge of the RDG. */ | |
198 | |
199 struct rdg_edge | |
200 { | |
201 /* Type of the dependence. */ | |
202 enum rdg_dep_type type; | |
203 }; | |
204 | |
205 #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type | |
206 | |
145 | 207 /* Kind of distributed loop. */ |
208 enum partition_kind { | |
209 PKIND_NORMAL, | |
210 /* Partial memset stands for a paritition can be distributed into a loop | |
211 of memset calls, rather than a single memset call. It's handled just | |
212 like a normal parition, i.e, distributed as separate loop, no memset | |
213 call is generated. | |
214 | |
215 Note: This is a hacking fix trying to distribute ZERO-ing stmt in a | |
216 loop nest as deep as possible. As a result, parloop achieves better | |
217 parallelization by parallelizing deeper loop nest. This hack should | |
218 be unnecessary and removed once distributed memset can be understood | |
219 and analyzed in data reference analysis. See PR82604 for more. */ | |
220 PKIND_PARTIAL_MEMSET, | |
221 PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE | |
222 }; | |
223 | |
224 /* Type of distributed loop. */ | |
225 enum partition_type { | |
226 /* The distributed loop can be executed parallelly. */ | |
227 PTYPE_PARALLEL = 0, | |
228 /* The distributed loop has to be executed sequentially. */ | |
229 PTYPE_SEQUENTIAL | |
230 }; | |
231 | |
232 /* Builtin info for loop distribution. */ | |
233 struct builtin_info | |
234 { | |
235 /* data-references a kind != PKIND_NORMAL partition is about. */ | |
236 data_reference_p dst_dr; | |
237 data_reference_p src_dr; | |
238 /* Base address and size of memory objects operated by the builtin. Note | |
239 both dest and source memory objects must have the same size. */ | |
240 tree dst_base; | |
241 tree src_base; | |
242 tree size; | |
243 /* Base and offset part of dst_base after stripping constant offset. This | |
244 is only used in memset builtin distribution for now. */ | |
245 tree dst_base_base; | |
246 unsigned HOST_WIDE_INT dst_base_offset; | |
247 }; | |
248 | |
249 /* Partition for loop distribution. */ | |
250 struct partition | |
251 { | |
252 /* Statements of the partition. */ | |
253 bitmap stmts; | |
254 /* True if the partition defines variable which is used outside of loop. */ | |
255 bool reduction_p; | |
256 location_t loc; | |
257 enum partition_kind kind; | |
258 enum partition_type type; | |
259 /* Data references in the partition. */ | |
260 bitmap datarefs; | |
261 /* Information of builtin parition. */ | |
262 struct builtin_info *builtin; | |
263 }; | |
264 | |
265 /* Partitions are fused because of different reasons. */ | |
266 enum fuse_type | |
267 { | |
268 FUSE_NON_BUILTIN = 0, | |
269 FUSE_REDUCTION = 1, | |
270 FUSE_SHARE_REF = 2, | |
271 FUSE_SAME_SCC = 3, | |
272 FUSE_FINALIZE = 4 | |
273 }; | |
274 | |
275 /* Description on different fusing reason. */ | |
276 static const char *fuse_message[] = { | |
277 "they are non-builtins", | |
278 "they have reductions", | |
279 "they have shared memory refs", | |
280 "they are in the same dependence scc", | |
281 "there is no point to distribute loop"}; | |
282 | |
283 | |
111 | 284 /* Dump vertex I in RDG to FILE. */ |
285 | |
286 static void | |
287 dump_rdg_vertex (FILE *file, struct graph *rdg, int i) | |
288 { | |
289 struct vertex *v = &(rdg->vertices[i]); | |
290 struct graph_edge *e; | |
291 | |
292 fprintf (file, "(vertex %d: (%s%s) (in:", i, | |
293 RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "", | |
294 RDG_MEM_READS_STMT (rdg, i) ? "r" : ""); | |
295 | |
296 if (v->pred) | |
297 for (e = v->pred; e; e = e->pred_next) | |
298 fprintf (file, " %d", e->src); | |
299 | |
300 fprintf (file, ") (out:"); | |
301 | |
302 if (v->succ) | |
303 for (e = v->succ; e; e = e->succ_next) | |
304 fprintf (file, " %d", e->dest); | |
305 | |
306 fprintf (file, ")\n"); | |
307 print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS); | |
308 fprintf (file, ")\n"); | |
309 } | |
310 | |
311 /* Call dump_rdg_vertex on stderr. */ | |
312 | |
313 DEBUG_FUNCTION void | |
314 debug_rdg_vertex (struct graph *rdg, int i) | |
315 { | |
316 dump_rdg_vertex (stderr, rdg, i); | |
317 } | |
318 | |
319 /* Dump the reduced dependence graph RDG to FILE. */ | |
320 | |
321 static void | |
322 dump_rdg (FILE *file, struct graph *rdg) | |
323 { | |
324 fprintf (file, "(rdg\n"); | |
325 for (int i = 0; i < rdg->n_vertices; i++) | |
326 dump_rdg_vertex (file, rdg, i); | |
327 fprintf (file, ")\n"); | |
328 } | |
329 | |
330 /* Call dump_rdg on stderr. */ | |
331 | |
332 DEBUG_FUNCTION void | |
333 debug_rdg (struct graph *rdg) | |
334 { | |
335 dump_rdg (stderr, rdg); | |
336 } | |
337 | |
338 static void | |
339 dot_rdg_1 (FILE *file, struct graph *rdg) | |
340 { | |
341 int i; | |
342 pretty_printer buffer; | |
343 pp_needs_newline (&buffer) = false; | |
344 buffer.buffer->stream = file; | |
345 | |
346 fprintf (file, "digraph RDG {\n"); | |
347 | |
348 for (i = 0; i < rdg->n_vertices; i++) | |
349 { | |
350 struct vertex *v = &(rdg->vertices[i]); | |
351 struct graph_edge *e; | |
352 | |
353 fprintf (file, "%d [label=\"[%d] ", i, i); | |
354 pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM); | |
355 pp_flush (&buffer); | |
356 fprintf (file, "\"]\n"); | |
357 | |
358 /* Highlight reads from memory. */ | |
359 if (RDG_MEM_READS_STMT (rdg, i)) | |
360 fprintf (file, "%d [style=filled, fillcolor=green]\n", i); | |
361 | |
362 /* Highlight stores to memory. */ | |
363 if (RDG_MEM_WRITE_STMT (rdg, i)) | |
364 fprintf (file, "%d [style=filled, fillcolor=red]\n", i); | |
365 | |
366 if (v->succ) | |
367 for (e = v->succ; e; e = e->succ_next) | |
368 switch (RDGE_TYPE (e)) | |
369 { | |
370 case flow_dd: | |
371 /* These are the most common dependences: don't print these. */ | |
372 fprintf (file, "%d -> %d \n", i, e->dest); | |
373 break; | |
374 | |
375 case control_dd: | |
376 fprintf (file, "%d -> %d [label=control] \n", i, e->dest); | |
377 break; | |
378 | |
379 default: | |
380 gcc_unreachable (); | |
381 } | |
382 } | |
383 | |
384 fprintf (file, "}\n\n"); | |
385 } | |
386 | |
387 /* Display the Reduced Dependence Graph using dotty. */ | |
388 | |
389 DEBUG_FUNCTION void | |
390 dot_rdg (struct graph *rdg) | |
391 { | |
392 /* When debugging, you may want to enable the following code. */ | |
393 #ifdef HAVE_POPEN | |
394 FILE *file = popen ("dot -Tx11", "w"); | |
395 if (!file) | |
396 return; | |
397 dot_rdg_1 (file, rdg); | |
398 fflush (file); | |
399 close (fileno (file)); | |
400 pclose (file); | |
401 #else | |
402 dot_rdg_1 (stderr, rdg); | |
403 #endif | |
404 } | |
405 | |
406 /* Returns the index of STMT in RDG. */ | |
407 | |
408 static int | |
409 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt) | |
410 { | |
411 int index = gimple_uid (stmt); | |
412 gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt); | |
413 return index; | |
414 } | |
415 | |
416 /* Creates dependence edges in RDG for all the uses of DEF. IDEF is | |
417 the index of DEF in RDG. */ | |
418 | |
419 static void | |
420 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef) | |
421 { | |
422 use_operand_p imm_use_p; | |
423 imm_use_iterator iterator; | |
424 | |
425 FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def) | |
426 { | |
427 struct graph_edge *e; | |
428 int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p)); | |
429 | |
430 if (use < 0) | |
431 continue; | |
432 | |
433 e = add_edge (rdg, idef, use); | |
434 e->data = XNEW (struct rdg_edge); | |
435 RDGE_TYPE (e) = flow_dd; | |
436 } | |
437 } | |
438 | |
439 /* Creates an edge for the control dependences of BB to the vertex V. */ | |
440 | |
441 static void | |
442 create_edge_for_control_dependence (struct graph *rdg, basic_block bb, | |
443 int v, control_dependences *cd) | |
444 { | |
445 bitmap_iterator bi; | |
446 unsigned edge_n; | |
447 EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index), | |
448 0, edge_n, bi) | |
449 { | |
450 basic_block cond_bb = cd->get_edge_src (edge_n); | |
451 gimple *stmt = last_stmt (cond_bb); | |
452 if (stmt && is_ctrl_stmt (stmt)) | |
453 { | |
454 struct graph_edge *e; | |
455 int c = rdg_vertex_for_stmt (rdg, stmt); | |
456 if (c < 0) | |
457 continue; | |
458 | |
459 e = add_edge (rdg, c, v); | |
460 e->data = XNEW (struct rdg_edge); | |
461 RDGE_TYPE (e) = control_dd; | |
462 } | |
463 } | |
464 } | |
465 | |
466 /* Creates the edges of the reduced dependence graph RDG. */ | |
0 | 467 |
468 static void | |
111 | 469 create_rdg_flow_edges (struct graph *rdg) |
0 | 470 { |
111 | 471 int i; |
472 def_operand_p def_p; | |
473 ssa_op_iter iter; | |
474 | |
475 for (i = 0; i < rdg->n_vertices; i++) | |
476 FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i), | |
477 iter, SSA_OP_DEF) | |
478 create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i); | |
479 } | |
480 | |
481 /* Creates the edges of the reduced dependence graph RDG. */ | |
482 | |
483 static void | |
484 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop) | |
485 { | |
486 int i; | |
487 | |
488 for (i = 0; i < rdg->n_vertices; i++) | |
0 | 489 { |
111 | 490 gimple *stmt = RDG_STMT (rdg, i); |
491 if (gimple_code (stmt) == GIMPLE_PHI) | |
0 | 492 { |
111 | 493 edge_iterator ei; |
494 edge e; | |
495 FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds) | |
496 if (flow_bb_inside_loop_p (loop, e->src)) | |
497 create_edge_for_control_dependence (rdg, e->src, i, cd); | |
0 | 498 } |
499 else | |
111 | 500 create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd); |
501 } | |
502 } | |
503 | |
145 | 504 |
505 class loop_distribution | |
506 { | |
507 private: | |
508 /* The loop (nest) to be distributed. */ | |
509 vec<loop_p> loop_nest; | |
510 | |
511 /* Vector of data references in the loop to be distributed. */ | |
512 vec<data_reference_p> datarefs_vec; | |
513 | |
514 /* If there is nonaddressable data reference in above vector. */ | |
515 bool has_nonaddressable_dataref_p; | |
516 | |
517 /* Store index of data reference in aux field. */ | |
518 | |
519 /* Hash table for data dependence relation in the loop to be distributed. */ | |
520 hash_table<ddr_hasher> *ddrs_table; | |
521 | |
522 /* Array mapping basic block's index to its topological order. */ | |
523 int *bb_top_order_index; | |
524 /* And size of the array. */ | |
525 int bb_top_order_index_size; | |
526 | |
527 /* Build the vertices of the reduced dependence graph RDG. Return false | |
528 if that failed. */ | |
529 bool create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop); | |
530 | |
531 /* Initialize STMTS with all the statements of LOOP. We use topological | |
532 order to discover all statements. The order is important because | |
533 generate_loops_for_partition is using the same traversal for identifying | |
534 statements in loop copies. */ | |
535 void stmts_from_loop (class loop *loop, vec<gimple *> *stmts); | |
536 | |
537 | |
538 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of | |
539 LOOP, and one edge per flow dependence or control dependence from control | |
540 dependence CD. During visiting each statement, data references are also | |
541 collected and recorded in global data DATAREFS_VEC. */ | |
542 struct graph * build_rdg (class loop *loop, control_dependences *cd); | |
543 | |
544 /* Merge PARTITION into the partition DEST. RDG is the reduced dependence | |
545 graph and we update type for result partition if it is non-NULL. */ | |
546 void partition_merge_into (struct graph *rdg, | |
547 partition *dest, partition *partition, | |
548 enum fuse_type ft); | |
549 | |
550 | |
551 /* Return data dependence relation for data references A and B. The two | |
552 data references must be in lexicographic order wrto reduced dependence | |
553 graph RDG. We firstly try to find ddr from global ddr hash table. If | |
554 it doesn't exist, compute the ddr and cache it. */ | |
555 data_dependence_relation * get_data_dependence (struct graph *rdg, | |
556 data_reference_p a, | |
557 data_reference_p b); | |
558 | |
559 | |
560 /* In reduced dependence graph RDG for loop distribution, return true if | |
561 dependence between references DR1 and DR2 leads to a dependence cycle | |
562 and such dependence cycle can't be resolved by runtime alias check. */ | |
563 bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1, | |
564 data_reference_p dr2); | |
565 | |
566 | |
567 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update | |
568 PARTITION1's type after merging PARTITION2 into PARTITION1. */ | |
569 void update_type_for_merge (struct graph *rdg, | |
570 partition *partition1, partition *partition2); | |
571 | |
572 | |
573 /* Returns a partition with all the statements needed for computing | |
574 the vertex V of the RDG, also including the loop exit conditions. */ | |
575 partition *build_rdg_partition_for_vertex (struct graph *rdg, int v); | |
576 | |
577 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify | |
578 if it forms builtin memcpy or memmove call. */ | |
579 void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition, | |
580 data_reference_p dst_dr, data_reference_p src_dr); | |
581 | |
582 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP. | |
583 For the moment we detect memset, memcpy and memmove patterns. Bitmap | |
584 STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. | |
585 Returns true if there is a reduction in all partitions and we | |
586 possibly did not mark PARTITION as having one for this reason. */ | |
587 | |
588 bool | |
589 classify_partition (loop_p loop, | |
590 struct graph *rdg, partition *partition, | |
591 bitmap stmt_in_all_partitions); | |
592 | |
593 | |
594 /* Returns true when PARTITION1 and PARTITION2 access the same memory | |
595 object in RDG. */ | |
596 bool share_memory_accesses (struct graph *rdg, | |
597 partition *partition1, partition *partition2); | |
598 | |
599 /* For each seed statement in STARTING_STMTS, this function builds | |
600 partition for it by adding depended statements according to RDG. | |
601 All partitions are recorded in PARTITIONS. */ | |
602 void rdg_build_partitions (struct graph *rdg, | |
603 vec<gimple *> starting_stmts, | |
604 vec<partition *> *partitions); | |
605 | |
606 /* Compute partition dependence created by the data references in DRS1 | |
607 and DRS2, modify and return DIR according to that. IF ALIAS_DDR is | |
608 not NULL, we record dependence introduced by possible alias between | |
609 two data references in ALIAS_DDRS; otherwise, we simply ignore such | |
610 dependence as if it doesn't exist at all. */ | |
611 int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1, | |
612 bitmap drs2, vec<ddr_p> *alias_ddrs); | |
613 | |
614 | |
615 /* Build and return partition dependence graph for PARTITIONS. RDG is | |
616 reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P | |
617 is true, data dependence caused by possible alias between references | |
618 is ignored, as if it doesn't exist at all; otherwise all depdendences | |
619 are considered. */ | |
620 struct graph *build_partition_graph (struct graph *rdg, | |
621 vec<struct partition *> *partitions, | |
622 bool ignore_alias_p); | |
623 | |
624 /* Given reduced dependence graph RDG merge strong connected components | |
625 of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused by | |
626 possible alias between references is ignored, as if it doesn't exist | |
627 at all; otherwise all depdendences are considered. */ | |
628 void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *> | |
629 *partitions, bool ignore_alias_p); | |
630 | |
631 /* This is the main function breaking strong conected components in | |
632 PARTITIONS giving reduced depdendence graph RDG. Store data dependence | |
633 relations for runtime alias check in ALIAS_DDRS. */ | |
634 void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *> | |
635 *partitions, vec<ddr_p> *alias_ddrs); | |
636 | |
637 | |
638 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. | |
639 ALIAS_DDRS contains ddrs which need runtime alias check. */ | |
640 void finalize_partitions (class loop *loop, vec<struct partition *> | |
641 *partitions, vec<ddr_p> *alias_ddrs); | |
642 | |
643 /* Distributes the code from LOOP in such a way that producer statements | |
644 are placed before consumer statements. Tries to separate only the | |
645 statements from STMTS into separate loops. Returns the number of | |
646 distributed loops. Set NB_CALLS to number of generated builtin calls. | |
647 Set *DESTROY_P to whether LOOP needs to be destroyed. */ | |
648 int distribute_loop (class loop *loop, vec<gimple *> stmts, | |
649 control_dependences *cd, int *nb_calls, bool *destroy_p, | |
650 bool only_patterns_p); | |
651 | |
652 /* Compute topological order for basic blocks. Topological order is | |
653 needed because data dependence is computed for data references in | |
654 lexicographical order. */ | |
655 void bb_top_order_init (void); | |
656 | |
657 void bb_top_order_destroy (void); | |
658 | |
659 public: | |
660 | |
661 /* Getter for bb_top_order. */ | |
662 | |
663 inline int get_bb_top_order_index_size (void) | |
664 { | |
665 return bb_top_order_index_size; | |
666 } | |
667 | |
668 inline int get_bb_top_order_index (int i) | |
669 { | |
670 return bb_top_order_index[i]; | |
671 } | |
672 | |
673 unsigned int execute (function *fun); | |
674 }; | |
675 | |
676 | |
677 /* If X has a smaller topological sort number than Y, returns -1; | |
678 if greater, returns 1. */ | |
679 static int | |
680 bb_top_order_cmp_r (const void *x, const void *y, void *loop) | |
681 { | |
682 loop_distribution *_loop = | |
683 (loop_distribution *) loop; | |
684 | |
685 basic_block bb1 = *(const basic_block *) x; | |
686 basic_block bb2 = *(const basic_block *) y; | |
687 | |
688 int bb_top_order_index_size = _loop->get_bb_top_order_index_size (); | |
689 | |
690 gcc_assert (bb1->index < bb_top_order_index_size | |
691 && bb2->index < bb_top_order_index_size); | |
692 gcc_assert (bb1 == bb2 | |
693 || _loop->get_bb_top_order_index(bb1->index) | |
694 != _loop->get_bb_top_order_index(bb2->index)); | |
695 | |
696 return (_loop->get_bb_top_order_index(bb1->index) - | |
697 _loop->get_bb_top_order_index(bb2->index)); | |
698 } | |
699 | |
700 bool | |
701 loop_distribution::create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, | |
702 loop_p loop) | |
111 | 703 { |
704 int i; | |
705 gimple *stmt; | |
706 | |
707 FOR_EACH_VEC_ELT (stmts, i, stmt) | |
708 { | |
709 struct vertex *v = &(rdg->vertices[i]); | |
710 | |
711 /* Record statement to vertex mapping. */ | |
712 gimple_set_uid (stmt, i); | |
713 | |
714 v->data = XNEW (struct rdg_vertex); | |
715 RDGV_STMT (v) = stmt; | |
716 RDGV_DATAREFS (v).create (0); | |
717 RDGV_HAS_MEM_WRITE (v) = false; | |
718 RDGV_HAS_MEM_READS (v) = false; | |
719 if (gimple_code (stmt) == GIMPLE_PHI) | |
720 continue; | |
721 | |
722 unsigned drp = datarefs_vec.length (); | |
723 if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec)) | |
724 return false; | |
725 for (unsigned j = drp; j < datarefs_vec.length (); ++j) | |
726 { | |
727 data_reference_p dr = datarefs_vec[j]; | |
728 if (DR_IS_READ (dr)) | |
729 RDGV_HAS_MEM_READS (v) = true; | |
730 else | |
731 RDGV_HAS_MEM_WRITE (v) = true; | |
732 RDGV_DATAREFS (v).safe_push (dr); | |
145 | 733 has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref); |
111 | 734 } |
735 } | |
736 return true; | |
737 } | |
738 | |
145 | 739 void |
740 loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts) | |
111 | 741 { |
742 unsigned int i; | |
145 | 743 basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r); |
111 | 744 |
745 for (i = 0; i < loop->num_nodes; i++) | |
746 { | |
747 basic_block bb = bbs[i]; | |
748 | |
749 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); | |
750 gsi_next (&bsi)) | |
751 if (!virtual_operand_p (gimple_phi_result (bsi.phi ()))) | |
752 stmts->safe_push (bsi.phi ()); | |
753 | |
754 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); | |
755 gsi_next (&bsi)) | |
756 { | |
757 gimple *stmt = gsi_stmt (bsi); | |
758 if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt)) | |
759 stmts->safe_push (stmt); | |
760 } | |
761 } | |
762 | |
763 free (bbs); | |
764 } | |
765 | |
766 /* Free the reduced dependence graph RDG. */ | |
767 | |
768 static void | |
769 free_rdg (struct graph *rdg) | |
770 { | |
771 int i; | |
772 | |
773 for (i = 0; i < rdg->n_vertices; i++) | |
774 { | |
775 struct vertex *v = &(rdg->vertices[i]); | |
776 struct graph_edge *e; | |
777 | |
778 for (e = v->succ; e; e = e->succ_next) | |
779 free (e->data); | |
780 | |
781 if (v->data) | |
782 { | |
783 gimple_set_uid (RDGV_STMT (v), -1); | |
784 (RDGV_DATAREFS (v)).release (); | |
785 free (v->data); | |
786 } | |
787 } | |
788 | |
789 free_graph (rdg); | |
790 } | |
791 | |
145 | 792 struct graph * |
793 loop_distribution::build_rdg (class loop *loop, control_dependences *cd) | |
111 | 794 { |
795 struct graph *rdg; | |
796 | |
797 /* Create the RDG vertices from the stmts of the loop nest. */ | |
798 auto_vec<gimple *, 10> stmts; | |
799 stmts_from_loop (loop, &stmts); | |
800 rdg = new_graph (stmts.length ()); | |
801 if (!create_rdg_vertices (rdg, stmts, loop)) | |
802 { | |
803 free_rdg (rdg); | |
804 return NULL; | |
0 | 805 } |
111 | 806 stmts.release (); |
807 | |
808 create_rdg_flow_edges (rdg); | |
809 if (cd) | |
810 create_rdg_cd_edges (rdg, cd, loop); | |
811 | |
812 return rdg; | |
813 } | |
814 | |
815 | |
816 /* Allocate and initialize a partition from BITMAP. */ | |
817 | |
818 static partition * | |
819 partition_alloc (void) | |
820 { | |
821 partition *partition = XCNEW (struct partition); | |
822 partition->stmts = BITMAP_ALLOC (NULL); | |
823 partition->reduction_p = false; | |
145 | 824 partition->loc = UNKNOWN_LOCATION; |
111 | 825 partition->kind = PKIND_NORMAL; |
145 | 826 partition->type = PTYPE_PARALLEL; |
111 | 827 partition->datarefs = BITMAP_ALLOC (NULL); |
828 return partition; | |
829 } | |
830 | |
831 /* Free PARTITION. */ | |
832 | |
833 static void | |
834 partition_free (partition *partition) | |
835 { | |
836 BITMAP_FREE (partition->stmts); | |
837 BITMAP_FREE (partition->datarefs); | |
838 if (partition->builtin) | |
839 free (partition->builtin); | |
840 | |
841 free (partition); | |
842 } | |
843 | |
844 /* Returns true if the partition can be generated as a builtin. */ | |
845 | |
846 static bool | |
847 partition_builtin_p (partition *partition) | |
848 { | |
131 | 849 return partition->kind > PKIND_PARTIAL_MEMSET; |
111 | 850 } |
851 | |
852 /* Returns true if the partition contains a reduction. */ | |
853 | |
854 static bool | |
855 partition_reduction_p (partition *partition) | |
856 { | |
857 return partition->reduction_p; | |
858 } | |
859 | |
145 | 860 void |
861 loop_distribution::partition_merge_into (struct graph *rdg, | |
862 partition *dest, partition *partition, enum fuse_type ft) | |
111 | 863 { |
864 if (dump_file && (dump_flags & TDF_DETAILS)) | |
865 { | |
866 fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]); | |
867 fprintf (dump_file, " Part 1: "); | |
868 dump_bitmap (dump_file, dest->stmts); | |
869 fprintf (dump_file, " Part 2: "); | |
870 dump_bitmap (dump_file, partition->stmts); | |
871 } | |
872 | |
873 dest->kind = PKIND_NORMAL; | |
874 if (dest->type == PTYPE_PARALLEL) | |
875 dest->type = partition->type; | |
876 | |
877 bitmap_ior_into (dest->stmts, partition->stmts); | |
878 if (partition_reduction_p (partition)) | |
879 dest->reduction_p = true; | |
880 | |
881 /* Further check if any data dependence prevents us from executing the | |
882 new partition parallelly. */ | |
883 if (dest->type == PTYPE_PARALLEL && rdg != NULL) | |
884 update_type_for_merge (rdg, dest, partition); | |
885 | |
886 bitmap_ior_into (dest->datarefs, partition->datarefs); | |
887 } | |
888 | |
889 | |
890 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after | |
891 the LOOP. */ | |
892 | |
893 static bool | |
894 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop) | |
895 { | |
896 imm_use_iterator imm_iter; | |
897 use_operand_p use_p; | |
898 | |
899 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def) | |
900 { | |
901 if (is_gimple_debug (USE_STMT (use_p))) | |
902 continue; | |
903 | |
904 basic_block use_bb = gimple_bb (USE_STMT (use_p)); | |
905 if (!flow_bb_inside_loop_p (loop, use_bb)) | |
906 return true; | |
907 } | |
908 | |
909 return false; | |
910 } | |
911 | |
912 /* Returns true when STMT defines a scalar variable used after the | |
913 loop LOOP. */ | |
914 | |
915 static bool | |
916 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt) | |
917 { | |
918 def_operand_p def_p; | |
919 ssa_op_iter op_iter; | |
920 | |
921 if (gimple_code (stmt) == GIMPLE_PHI) | |
922 return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop); | |
923 | |
924 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF) | |
925 if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop)) | |
926 return true; | |
927 | |
928 return false; | |
0 | 929 } |
930 | |
931 /* Return a copy of LOOP placed before LOOP. */ | |
932 | |
145 | 933 static class loop * |
934 copy_loop_before (class loop *loop) | |
0 | 935 { |
145 | 936 class loop *res; |
0 | 937 edge preheader = loop_preheader_edge (loop); |
938 | |
939 initialize_original_copy_tables (); | |
111 | 940 res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader); |
941 gcc_assert (res != NULL); | |
0 | 942 free_original_copy_tables (); |
111 | 943 delete_update_ssa (); |
0 | 944 |
945 return res; | |
946 } | |
947 | |
948 /* Creates an empty basic block after LOOP. */ | |
949 | |
950 static void | |
145 | 951 create_bb_after_loop (class loop *loop) |
0 | 952 { |
953 edge exit = single_exit (loop); | |
954 | |
955 if (!exit) | |
956 return; | |
957 | |
958 split_edge (exit); | |
959 } | |
960 | |
961 /* Generate code for PARTITION from the code in LOOP. The loop is | |
962 copied when COPY_P is true. All the statements not flagged in the | |
963 PARTITION bitmap are removed from the loop or from its copy. The | |
964 statements are indexed in sequence inside a basic block, and the | |
111 | 965 basic blocks of a loop are taken in dom order. */ |
966 | |
967 static void | |
145 | 968 generate_loops_for_partition (class loop *loop, partition *partition, |
111 | 969 bool copy_p) |
0 | 970 { |
111 | 971 unsigned i; |
0 | 972 basic_block *bbs; |
973 | |
974 if (copy_p) | |
975 { | |
111 | 976 int orig_loop_num = loop->orig_loop_num; |
0 | 977 loop = copy_loop_before (loop); |
111 | 978 gcc_assert (loop != NULL); |
979 loop->orig_loop_num = orig_loop_num; | |
0 | 980 create_preheader (loop, CP_SIMPLE_PREHEADERS); |
981 create_bb_after_loop (loop); | |
982 } | |
111 | 983 else |
0 | 984 { |
111 | 985 /* Origin number is set to the new versioned loop's num. */ |
986 gcc_assert (loop->orig_loop_num != loop->num); | |
0 | 987 } |
988 | |
111 | 989 /* Remove stmts not in the PARTITION bitmap. */ |
0 | 990 bbs = get_loop_body_in_dom_order (loop); |
991 | |
131 | 992 if (MAY_HAVE_DEBUG_BIND_STMTS) |
111 | 993 for (i = 0; i < loop->num_nodes; i++) |
994 { | |
995 basic_block bb = bbs[i]; | |
996 | |
997 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); | |
998 gsi_next (&bsi)) | |
999 { | |
1000 gphi *phi = bsi.phi (); | |
1001 if (!virtual_operand_p (gimple_phi_result (phi)) | |
1002 && !bitmap_bit_p (partition->stmts, gimple_uid (phi))) | |
1003 reset_debug_uses (phi); | |
1004 } | |
1005 | |
1006 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) | |
1007 { | |
1008 gimple *stmt = gsi_stmt (bsi); | |
1009 if (gimple_code (stmt) != GIMPLE_LABEL | |
1010 && !is_gimple_debug (stmt) | |
1011 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt))) | |
1012 reset_debug_uses (stmt); | |
1013 } | |
1014 } | |
1015 | |
0 | 1016 for (i = 0; i < loop->num_nodes; i++) |
1017 { | |
1018 basic_block bb = bbs[i]; | |
111 | 1019 edge inner_exit = NULL; |
1020 | |
1021 if (loop != bb->loop_father) | |
1022 inner_exit = single_exit (bb->loop_father); | |
1023 | |
1024 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);) | |
1025 { | |
1026 gphi *phi = bsi.phi (); | |
1027 if (!virtual_operand_p (gimple_phi_result (phi)) | |
1028 && !bitmap_bit_p (partition->stmts, gimple_uid (phi))) | |
1029 remove_phi_node (&bsi, true); | |
1030 else | |
1031 gsi_next (&bsi); | |
1032 } | |
1033 | |
1034 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);) | |
0 | 1035 { |
111 | 1036 gimple *stmt = gsi_stmt (bsi); |
1037 if (gimple_code (stmt) != GIMPLE_LABEL | |
1038 && !is_gimple_debug (stmt) | |
1039 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt))) | |
0 | 1040 { |
111 | 1041 /* In distribution of loop nest, if bb is inner loop's exit_bb, |
1042 we choose its exit edge/path in order to avoid generating | |
1043 infinite loop. For all other cases, we choose an arbitrary | |
1044 path through the empty CFG part that this unnecessary | |
1045 control stmt controls. */ | |
1046 if (gcond *cond_stmt = dyn_cast <gcond *> (stmt)) | |
1047 { | |
1048 if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE) | |
1049 gimple_cond_make_true (cond_stmt); | |
1050 else | |
1051 gimple_cond_make_false (cond_stmt); | |
1052 update_stmt (stmt); | |
1053 } | |
1054 else if (gimple_code (stmt) == GIMPLE_SWITCH) | |
1055 { | |
1056 gswitch *switch_stmt = as_a <gswitch *> (stmt); | |
1057 gimple_switch_set_index | |
1058 (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1))); | |
1059 update_stmt (stmt); | |
1060 } | |
1061 else | |
1062 { | |
1063 unlink_stmt_vdef (stmt); | |
1064 gsi_remove (&bsi, true); | |
1065 release_defs (stmt); | |
1066 continue; | |
1067 } | |
0 | 1068 } |
111 | 1069 gsi_next (&bsi); |
0 | 1070 } |
1071 } | |
1072 | |
111 | 1073 free (bbs); |
1074 } | |
1075 | |
1076 /* If VAL memory representation contains the same value in all bytes, | |
1077 return that value, otherwise return -1. | |
1078 E.g. for 0x24242424 return 0x24, for IEEE double | |
1079 747708026454360457216.0 return 0x44, etc. */ | |
1080 | |
1081 static int | |
1082 const_with_all_bytes_same (tree val) | |
1083 { | |
1084 unsigned char buf[64]; | |
1085 int i, len; | |
1086 | |
1087 if (integer_zerop (val) | |
1088 || (TREE_CODE (val) == CONSTRUCTOR | |
1089 && !TREE_CLOBBER_P (val) | |
1090 && CONSTRUCTOR_NELTS (val) == 0)) | |
1091 return 0; | |
1092 | |
1093 if (real_zerop (val)) | |
1094 { | |
1095 /* Only return 0 for +0.0, not for -0.0, which doesn't have | |
1096 an all bytes same memory representation. Don't transform | |
1097 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */ | |
1098 switch (TREE_CODE (val)) | |
1099 { | |
1100 case REAL_CST: | |
1101 if (!real_isneg (TREE_REAL_CST_PTR (val))) | |
1102 return 0; | |
1103 break; | |
1104 case COMPLEX_CST: | |
1105 if (!const_with_all_bytes_same (TREE_REALPART (val)) | |
1106 && !const_with_all_bytes_same (TREE_IMAGPART (val))) | |
1107 return 0; | |
1108 break; | |
1109 case VECTOR_CST: | |
131 | 1110 { |
1111 unsigned int count = vector_cst_encoded_nelts (val); | |
1112 unsigned int j; | |
1113 for (j = 0; j < count; ++j) | |
1114 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j))) | |
1115 break; | |
1116 if (j == count) | |
1117 return 0; | |
1118 break; | |
1119 } | |
111 | 1120 default: |
1121 break; | |
1122 } | |
1123 } | |
1124 | |
1125 if (CHAR_BIT != 8 || BITS_PER_UNIT != 8) | |
1126 return -1; | |
1127 | |
1128 len = native_encode_expr (val, buf, sizeof (buf)); | |
1129 if (len == 0) | |
1130 return -1; | |
1131 for (i = 1; i < len; i++) | |
1132 if (buf[i] != buf[0]) | |
1133 return -1; | |
1134 return buf[0]; | |
1135 } | |
1136 | |
1137 /* Generate a call to memset for PARTITION in LOOP. */ | |
1138 | |
1139 static void | |
145 | 1140 generate_memset_builtin (class loop *loop, partition *partition) |
111 | 1141 { |
1142 gimple_stmt_iterator gsi; | |
1143 tree mem, fn, nb_bytes; | |
1144 tree val; | |
1145 struct builtin_info *builtin = partition->builtin; | |
1146 gimple *fn_call; | |
0 | 1147 |
1148 /* The new statements will be placed before LOOP. */ | |
111 | 1149 gsi = gsi_last_bb (loop_preheader_edge (loop)->src); |
1150 | |
145 | 1151 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size); |
111 | 1152 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE, |
1153 false, GSI_CONTINUE_LINKING); | |
1154 mem = builtin->dst_base; | |
1155 mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE, | |
1156 false, GSI_CONTINUE_LINKING); | |
1157 | |
1158 /* This exactly matches the pattern recognition in classify_partition. */ | |
1159 val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr)); | |
1160 /* Handle constants like 0x15151515 and similarly | |
1161 floating point constants etc. where all bytes are the same. */ | |
1162 int bytev = const_with_all_bytes_same (val); | |
1163 if (bytev != -1) | |
1164 val = build_int_cst (integer_type_node, bytev); | |
1165 else if (TREE_CODE (val) == INTEGER_CST) | |
1166 val = fold_convert (integer_type_node, val); | |
1167 else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val))) | |
1168 { | |
1169 tree tem = make_ssa_name (integer_type_node); | |
1170 gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val); | |
1171 gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING); | |
1172 val = tem; | |
1173 } | |
1174 | |
1175 fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET)); | |
1176 fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes); | |
145 | 1177 gimple_set_location (fn_call, partition->loc); |
111 | 1178 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); |
145 | 1179 fold_stmt (&gsi); |
111 | 1180 |
1181 if (dump_file && (dump_flags & TDF_DETAILS)) | |
0 | 1182 { |
111 | 1183 fprintf (dump_file, "generated memset"); |
1184 if (bytev == 0) | |
1185 fprintf (dump_file, " zero\n"); | |
1186 else | |
1187 fprintf (dump_file, "\n"); | |
1188 } | |
1189 } | |
1190 | |
1191 /* Generate a call to memcpy for PARTITION in LOOP. */ | |
1192 | |
1193 static void | |
145 | 1194 generate_memcpy_builtin (class loop *loop, partition *partition) |
111 | 1195 { |
1196 gimple_stmt_iterator gsi; | |
1197 gimple *fn_call; | |
1198 tree dest, src, fn, nb_bytes; | |
1199 enum built_in_function kind; | |
1200 struct builtin_info *builtin = partition->builtin; | |
1201 | |
1202 /* The new statements will be placed before LOOP. */ | |
1203 gsi = gsi_last_bb (loop_preheader_edge (loop)->src); | |
1204 | |
145 | 1205 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size); |
111 | 1206 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE, |
1207 false, GSI_CONTINUE_LINKING); | |
1208 dest = builtin->dst_base; | |
1209 src = builtin->src_base; | |
1210 if (partition->kind == PKIND_MEMCPY | |
1211 || ! ptr_derefs_may_alias_p (dest, src)) | |
1212 kind = BUILT_IN_MEMCPY; | |
1213 else | |
1214 kind = BUILT_IN_MEMMOVE; | |
1215 | |
1216 dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE, | |
1217 false, GSI_CONTINUE_LINKING); | |
1218 src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE, | |
1219 false, GSI_CONTINUE_LINKING); | |
1220 fn = build_fold_addr_expr (builtin_decl_implicit (kind)); | |
1221 fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes); | |
145 | 1222 gimple_set_location (fn_call, partition->loc); |
111 | 1223 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); |
145 | 1224 fold_stmt (&gsi); |
111 | 1225 |
1226 if (dump_file && (dump_flags & TDF_DETAILS)) | |
1227 { | |
1228 if (kind == BUILT_IN_MEMCPY) | |
1229 fprintf (dump_file, "generated memcpy\n"); | |
1230 else | |
1231 fprintf (dump_file, "generated memmove\n"); | |
0 | 1232 } |
1233 } | |
1234 | |
111 | 1235 /* Remove and destroy the loop LOOP. */ |
1236 | |
1237 static void | |
145 | 1238 destroy_loop (class loop *loop) |
0 | 1239 { |
111 | 1240 unsigned nbbs = loop->num_nodes; |
1241 edge exit = single_exit (loop); | |
1242 basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest; | |
1243 basic_block *bbs; | |
1244 unsigned i; | |
1245 | |
1246 bbs = get_loop_body_in_dom_order (loop); | |
1247 | |
145 | 1248 gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest); |
1249 bool safe_p = single_pred_p (exit->dest); | |
1250 for (unsigned i = 0; i < nbbs; ++i) | |
1251 { | |
1252 /* We have made sure to not leave any dangling uses of SSA | |
1253 names defined in the loop. With the exception of virtuals. | |
1254 Make sure we replace all uses of virtual defs that will remain | |
1255 outside of the loop with the bare symbol as delete_basic_block | |
1256 will release them. */ | |
1257 for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); | |
1258 gsi_next (&gsi)) | |
1259 { | |
1260 gphi *phi = gsi.phi (); | |
1261 if (virtual_operand_p (gimple_phi_result (phi))) | |
1262 mark_virtual_phi_result_for_renaming (phi); | |
1263 } | |
1264 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);) | |
1265 { | |
1266 gimple *stmt = gsi_stmt (gsi); | |
1267 tree vdef = gimple_vdef (stmt); | |
1268 if (vdef && TREE_CODE (vdef) == SSA_NAME) | |
1269 mark_virtual_operand_for_renaming (vdef); | |
1270 /* Also move and eventually reset debug stmts. We can leave | |
1271 constant values in place in case the stmt dominates the exit. | |
1272 ??? Non-constant values from the last iteration can be | |
1273 replaced with final values if we can compute them. */ | |
1274 if (gimple_debug_bind_p (stmt)) | |
1275 { | |
1276 tree val = gimple_debug_bind_get_value (stmt); | |
1277 gsi_move_before (&gsi, &dst_gsi); | |
1278 if (val | |
1279 && (!safe_p | |
1280 || !is_gimple_min_invariant (val) | |
1281 || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i]))) | |
1282 { | |
1283 gimple_debug_bind_reset_value (stmt); | |
1284 update_stmt (stmt); | |
1285 } | |
1286 } | |
1287 else | |
1288 gsi_next (&gsi); | |
1289 } | |
1290 } | |
1291 | |
111 | 1292 redirect_edge_pred (exit, src); |
1293 exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE); | |
1294 exit->flags |= EDGE_FALLTHRU; | |
1295 cancel_loop_tree (loop); | |
1296 rescan_loop_exit (exit, false, true); | |
1297 | |
1298 i = nbbs; | |
1299 do | |
1300 { | |
1301 --i; | |
1302 delete_basic_block (bbs[i]); | |
1303 } | |
1304 while (i != 0); | |
1305 | |
1306 free (bbs); | |
1307 | |
1308 set_immediate_dominator (CDI_DOMINATORS, dest, | |
1309 recompute_dominator (CDI_DOMINATORS, dest)); | |
1310 } | |
1311 | |
1312 /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */ | |
1313 | |
1314 static bool | |
145 | 1315 generate_code_for_partition (class loop *loop, |
111 | 1316 partition *partition, bool copy_p) |
1317 { | |
1318 switch (partition->kind) | |
1319 { | |
1320 case PKIND_NORMAL: | |
131 | 1321 case PKIND_PARTIAL_MEMSET: |
111 | 1322 /* Reductions all have to be in the last partition. */ |
1323 gcc_assert (!partition_reduction_p (partition) | |
1324 || !copy_p); | |
1325 generate_loops_for_partition (loop, partition, copy_p); | |
1326 return false; | |
1327 | |
1328 case PKIND_MEMSET: | |
1329 generate_memset_builtin (loop, partition); | |
1330 break; | |
1331 | |
1332 case PKIND_MEMCPY: | |
1333 case PKIND_MEMMOVE: | |
1334 generate_memcpy_builtin (loop, partition); | |
1335 break; | |
1336 | |
1337 default: | |
1338 gcc_unreachable (); | |
1339 } | |
1340 | |
1341 /* Common tail for partitions we turn into a call. If this was the last | |
1342 partition for which we generate code, we have to destroy the loop. */ | |
1343 if (!copy_p) | |
0 | 1344 return true; |
1345 return false; | |
1346 } | |
1347 | |
145 | 1348 data_dependence_relation * |
1349 loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a, | |
1350 data_reference_p b) | |
0 | 1351 { |
111 | 1352 struct data_dependence_relation ent, **slot; |
1353 struct data_dependence_relation *ddr; | |
1354 | |
1355 gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b)); | |
1356 gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a)) | |
1357 <= rdg_vertex_for_stmt (rdg, DR_STMT (b))); | |
1358 ent.a = a; | |
1359 ent.b = b; | |
1360 slot = ddrs_table->find_slot (&ent, INSERT); | |
1361 if (*slot == NULL) | |
1362 { | |
1363 ddr = initialize_data_dependence_relation (a, b, loop_nest); | |
1364 compute_affine_dependence (ddr, loop_nest[0]); | |
1365 *slot = ddr; | |
1366 } | |
1367 | |
1368 return *slot; | |
0 | 1369 } |
1370 | |
145 | 1371 bool |
1372 loop_distribution::data_dep_in_cycle_p (struct graph *rdg, | |
1373 data_reference_p dr1, | |
1374 data_reference_p dr2) | |
0 | 1375 { |
111 | 1376 struct data_dependence_relation *ddr; |
1377 | |
1378 /* Re-shuffle data-refs to be in topological order. */ | |
1379 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1)) | |
1380 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2))) | |
1381 std::swap (dr1, dr2); | |
1382 | |
1383 ddr = get_data_dependence (rdg, dr1, dr2); | |
1384 | |
1385 /* In case of no data dependence. */ | |
1386 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) | |
1387 return false; | |
1388 /* For unknown data dependence or known data dependence which can't be | |
1389 expressed in classic distance vector, we check if it can be resolved | |
1390 by runtime alias check. If yes, we still consider data dependence | |
1391 as won't introduce data dependence cycle. */ | |
1392 else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know | |
1393 || DDR_NUM_DIST_VECTS (ddr) == 0) | |
1394 return !runtime_alias_check_p (ddr, NULL, true); | |
1395 else if (DDR_NUM_DIST_VECTS (ddr) > 1) | |
1396 return true; | |
1397 else if (DDR_REVERSED_P (ddr) | |
1398 || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1)) | |
1399 return false; | |
1400 | |
1401 return true; | |
0 | 1402 } |
1403 | |
145 | 1404 void |
1405 loop_distribution::update_type_for_merge (struct graph *rdg, | |
1406 partition *partition1, | |
1407 partition *partition2) | |
0 | 1408 { |
111 | 1409 unsigned i, j; |
1410 bitmap_iterator bi, bj; | |
1411 data_reference_p dr1, dr2; | |
1412 | |
1413 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi) | |
0 | 1414 { |
111 | 1415 unsigned start = (partition1 == partition2) ? i + 1 : 0; |
1416 | |
1417 dr1 = datarefs_vec[i]; | |
1418 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj) | |
0 | 1419 { |
111 | 1420 dr2 = datarefs_vec[j]; |
1421 if (DR_IS_READ (dr1) && DR_IS_READ (dr2)) | |
1422 continue; | |
1423 | |
1424 /* Partition can only be executed sequentially if there is any | |
1425 data dependence cycle. */ | |
1426 if (data_dep_in_cycle_p (rdg, dr1, dr2)) | |
0 | 1427 { |
111 | 1428 partition1->type = PTYPE_SEQUENTIAL; |
1429 return; | |
0 | 1430 } |
1431 } | |
1432 } | |
1433 } | |
1434 | |
145 | 1435 partition * |
1436 loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v) | |
0 | 1437 { |
111 | 1438 partition *partition = partition_alloc (); |
1439 auto_vec<int, 3> nodes; | |
1440 unsigned i, j; | |
0 | 1441 int x; |
111 | 1442 data_reference_p dr; |
1443 | |
1444 graphds_dfs (rdg, &v, 1, &nodes, false, NULL); | |
1445 | |
1446 FOR_EACH_VEC_ELT (nodes, i, x) | |
1447 { | |
1448 bitmap_set_bit (partition->stmts, x); | |
1449 | |
1450 for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j) | |
1451 { | |
1452 unsigned idx = (unsigned) DR_INDEX (dr); | |
1453 gcc_assert (idx < datarefs_vec.length ()); | |
1454 | |
1455 /* Partition can only be executed sequentially if there is any | |
1456 unknown data reference. */ | |
1457 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) | |
1458 || !DR_INIT (dr) || !DR_STEP (dr)) | |
1459 partition->type = PTYPE_SEQUENTIAL; | |
1460 | |
1461 bitmap_set_bit (partition->datarefs, idx); | |
1462 } | |
1463 } | |
1464 | |
1465 if (partition->type == PTYPE_SEQUENTIAL) | |
1466 return partition; | |
1467 | |
1468 /* Further check if any data dependence prevents us from executing the | |
1469 partition parallelly. */ | |
1470 update_type_for_merge (rdg, partition, partition); | |
1471 | |
1472 return partition; | |
0 | 1473 } |
1474 | |
111 | 1475 /* Given PARTITION of LOOP and RDG, record single load/store data references |
1476 for builtin partition in SRC_DR/DST_DR, return false if there is no such | |
1477 data references. */ | |
1478 | |
1479 static bool | |
145 | 1480 find_single_drs (class loop *loop, struct graph *rdg, partition *partition, |
111 | 1481 data_reference_p *dst_dr, data_reference_p *src_dr) |
0 | 1482 { |
1483 unsigned i; | |
111 | 1484 data_reference_p single_ld = NULL, single_st = NULL; |
0 | 1485 bitmap_iterator bi; |
111 | 1486 |
1487 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi) | |
0 | 1488 { |
111 | 1489 gimple *stmt = RDG_STMT (rdg, i); |
1490 data_reference_p dr; | |
1491 | |
1492 if (gimple_code (stmt) == GIMPLE_PHI) | |
1493 continue; | |
1494 | |
1495 /* Any scalar stmts are ok. */ | |
1496 if (!gimple_vuse (stmt)) | |
1497 continue; | |
1498 | |
1499 /* Otherwise just regular loads/stores. */ | |
1500 if (!gimple_assign_single_p (stmt)) | |
1501 return false; | |
1502 | |
1503 /* But exactly one store and/or load. */ | |
1504 for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j) | |
1505 { | |
1506 tree type = TREE_TYPE (DR_REF (dr)); | |
1507 | |
1508 /* The memset, memcpy and memmove library calls are only | |
1509 able to deal with generic address space. */ | |
1510 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type))) | |
1511 return false; | |
1512 | |
1513 if (DR_IS_READ (dr)) | |
1514 { | |
1515 if (single_ld != NULL) | |
1516 return false; | |
1517 single_ld = dr; | |
1518 } | |
1519 else | |
1520 { | |
1521 if (single_st != NULL) | |
1522 return false; | |
1523 single_st = dr; | |
1524 } | |
1525 } | |
0 | 1526 } |
1527 | |
111 | 1528 if (!single_st) |
1529 return false; | |
1530 | |
1531 /* Bail out if this is a bitfield memory reference. */ | |
1532 if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF | |
1533 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1))) | |
1534 return false; | |
1535 | |
1536 /* Data reference must be executed exactly once per iteration of each | |
1537 loop in the loop nest. We only need to check dominance information | |
1538 against the outermost one in a perfect loop nest because a bb can't | |
1539 dominate outermost loop's latch without dominating inner loop's. */ | |
1540 basic_block bb_st = gimple_bb (DR_STMT (single_st)); | |
1541 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st)) | |
1542 return false; | |
1543 | |
1544 if (single_ld) | |
1545 { | |
1546 gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld); | |
1547 /* Direct aggregate copy or via an SSA name temporary. */ | |
1548 if (load != store | |
1549 && gimple_assign_lhs (load) != gimple_assign_rhs1 (store)) | |
1550 return false; | |
1551 | |
1552 /* Bail out if this is a bitfield memory reference. */ | |
1553 if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF | |
1554 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1))) | |
1555 return false; | |
1556 | |
1557 /* Load and store must be in the same loop nest. */ | |
1558 basic_block bb_ld = gimple_bb (DR_STMT (single_ld)); | |
1559 if (bb_st->loop_father != bb_ld->loop_father) | |
1560 return false; | |
1561 | |
1562 /* Data reference must be executed exactly once per iteration. | |
1563 Same as single_st, we only need to check against the outermost | |
1564 loop. */ | |
1565 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld)) | |
1566 return false; | |
1567 | |
1568 edge e = single_exit (bb_st->loop_father); | |
1569 bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld); | |
1570 bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st); | |
1571 if (dom_ld != dom_st) | |
1572 return false; | |
1573 } | |
1574 | |
1575 *src_dr = single_ld; | |
1576 *dst_dr = single_st; | |
1577 return true; | |
0 | 1578 } |
1579 | |
111 | 1580 /* Given data reference DR in LOOP_NEST, this function checks the enclosing |
1581 loops from inner to outer to see if loop's step equals to access size at | |
131 | 1582 each level of loop. Return 2 if we can prove this at all level loops; |
1583 record access base and size in BASE and SIZE; save loop's step at each | |
1584 level of loop in STEPS if it is not null. For example: | |
111 | 1585 |
1586 int arr[100][100][100]; | |
1587 for (i = 0; i < 100; i++) ;steps[2] = 40000 | |
1588 for (j = 100; j > 0; j--) ;steps[1] = -400 | |
1589 for (k = 0; k < 100; k++) ;steps[0] = 4 | |
131 | 1590 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000 |
1591 | |
1592 Return 1 if we can prove the equality at the innermost loop, but not all | |
1593 level loops. In this case, no information is recorded. | |
1594 | |
1595 Return 0 if no equality can be proven at any level loops. */ | |
1596 | |
1597 static int | |
111 | 1598 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base, |
1599 tree *size, vec<tree> *steps = NULL) | |
0 | 1600 { |
111 | 1601 location_t loc = gimple_location (DR_STMT (dr)); |
1602 basic_block bb = gimple_bb (DR_STMT (dr)); | |
145 | 1603 class loop *loop = bb->loop_father; |
111 | 1604 tree ref = DR_REF (dr); |
1605 tree access_base = build_fold_addr_expr (ref); | |
1606 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref)); | |
131 | 1607 int res = 0; |
111 | 1608 |
1609 do { | |
1610 tree scev_fn = analyze_scalar_evolution (loop, access_base); | |
1611 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC) | |
131 | 1612 return res; |
111 | 1613 |
1614 access_base = CHREC_LEFT (scev_fn); | |
1615 if (tree_contains_chrecs (access_base, NULL)) | |
131 | 1616 return res; |
111 | 1617 |
1618 tree scev_step = CHREC_RIGHT (scev_fn); | |
1619 /* Only support constant steps. */ | |
1620 if (TREE_CODE (scev_step) != INTEGER_CST) | |
131 | 1621 return res; |
111 | 1622 |
1623 enum ev_direction access_dir = scev_direction (scev_fn); | |
1624 if (access_dir == EV_DIR_UNKNOWN) | |
131 | 1625 return res; |
111 | 1626 |
1627 if (steps != NULL) | |
1628 steps->safe_push (scev_step); | |
1629 | |
1630 scev_step = fold_convert_loc (loc, sizetype, scev_step); | |
1631 /* Compute absolute value of scev step. */ | |
1632 if (access_dir == EV_DIR_DECREASES) | |
1633 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step); | |
1634 | |
1635 /* At each level of loop, scev step must equal to access size. In other | |
1636 words, DR must access consecutive memory between loop iterations. */ | |
1637 if (!operand_equal_p (scev_step, access_size, 0)) | |
131 | 1638 return res; |
1639 | |
1640 /* Access stride can be computed for data reference at least for the | |
1641 innermost loop. */ | |
1642 res = 1; | |
111 | 1643 |
1644 /* Compute DR's execution times in loop. */ | |
1645 tree niters = number_of_latch_executions (loop); | |
1646 niters = fold_convert_loc (loc, sizetype, niters); | |
1647 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb)) | |
1648 niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node); | |
1649 | |
1650 /* Compute DR's overall access size in loop. */ | |
1651 access_size = fold_build2_loc (loc, MULT_EXPR, sizetype, | |
1652 niters, scev_step); | |
1653 /* Adjust base address in case of negative step. */ | |
1654 if (access_dir == EV_DIR_DECREASES) | |
1655 { | |
1656 tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype, | |
1657 scev_step, access_size); | |
1658 access_base = fold_build_pointer_plus_loc (loc, access_base, adj); | |
1659 } | |
1660 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL); | |
1661 | |
1662 *base = access_base; | |
1663 *size = access_size; | |
131 | 1664 /* Access stride can be computed for data reference at each level loop. */ |
1665 return 2; | |
0 | 1666 } |
1667 | |
111 | 1668 /* Allocate and return builtin struct. Record information like DST_DR, |
1669 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */ | |
1670 | |
1671 static struct builtin_info * | |
1672 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr, | |
1673 tree dst_base, tree src_base, tree size) | |
1674 { | |
1675 struct builtin_info *builtin = XNEW (struct builtin_info); | |
1676 builtin->dst_dr = dst_dr; | |
1677 builtin->src_dr = src_dr; | |
1678 builtin->dst_base = dst_base; | |
1679 builtin->src_base = src_base; | |
1680 builtin->size = size; | |
1681 return builtin; | |
1682 } | |
1683 | |
1684 /* Given data reference DR in loop nest LOOP, classify if it forms builtin | |
1685 memset call. */ | |
0 | 1686 |
1687 static void | |
111 | 1688 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr) |
0 | 1689 { |
111 | 1690 gimple *stmt = DR_STMT (dr); |
1691 tree base, size, rhs = gimple_assign_rhs1 (stmt); | |
1692 | |
1693 if (const_with_all_bytes_same (rhs) == -1 | |
1694 && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs)) | |
1695 || (TYPE_MODE (TREE_TYPE (rhs)) | |
1696 != TYPE_MODE (unsigned_char_type_node)))) | |
1697 return; | |
1698 | |
1699 if (TREE_CODE (rhs) == SSA_NAME | |
1700 && !SSA_NAME_IS_DEFAULT_DEF (rhs) | |
1701 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs)))) | |
1702 return; | |
1703 | |
131 | 1704 int res = compute_access_range (loop, dr, &base, &size); |
1705 if (res == 0) | |
1706 return; | |
1707 if (res == 1) | |
1708 { | |
1709 partition->kind = PKIND_PARTIAL_MEMSET; | |
1710 return; | |
1711 } | |
1712 | |
1713 poly_uint64 base_offset; | |
1714 unsigned HOST_WIDE_INT const_base_offset; | |
1715 tree base_base = strip_offset (base, &base_offset); | |
1716 if (!base_offset.is_constant (&const_base_offset)) | |
111 | 1717 return; |
1718 | |
1719 struct builtin_info *builtin; | |
1720 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size); | |
131 | 1721 builtin->dst_base_base = base_base; |
1722 builtin->dst_base_offset = const_base_offset; | |
111 | 1723 partition->builtin = builtin; |
1724 partition->kind = PKIND_MEMSET; | |
0 | 1725 } |
1726 | |
111 | 1727 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify |
1728 if it forms builtin memcpy or memmove call. */ | |
0 | 1729 |
145 | 1730 void |
1731 loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg, | |
1732 partition *partition, | |
1733 data_reference_p dst_dr, | |
1734 data_reference_p src_dr) | |
0 | 1735 { |
111 | 1736 tree base, size, src_base, src_size; |
1737 auto_vec<tree> dst_steps, src_steps; | |
1738 | |
131 | 1739 /* Compute access range of both load and store. */ |
1740 int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps); | |
1741 if (res != 2) | |
1742 return; | |
1743 res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps); | |
1744 if (res != 2) | |
1745 return; | |
1746 | |
1747 /* They much have the same access size. */ | |
1748 if (!operand_equal_p (size, src_size, 0)) | |
111 | 1749 return; |
1750 | |
1751 /* Load and store in loop nest must access memory in the same way, i.e, | |
1752 their must have the same steps in each loop of the nest. */ | |
1753 if (dst_steps.length () != src_steps.length ()) | |
1754 return; | |
1755 for (unsigned i = 0; i < dst_steps.length (); ++i) | |
1756 if (!operand_equal_p (dst_steps[i], src_steps[i], 0)) | |
1757 return; | |
1758 | |
1759 /* Now check that if there is a dependence. */ | |
1760 ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr); | |
1761 | |
1762 /* Classify as memcpy if no dependence between load and store. */ | |
1763 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) | |
1764 { | |
1765 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size); | |
1766 partition->kind = PKIND_MEMCPY; | |
1767 return; | |
1768 } | |
1769 | |
1770 /* Can't do memmove in case of unknown dependence or dependence without | |
1771 classical distance vector. */ | |
1772 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know | |
1773 || DDR_NUM_DIST_VECTS (ddr) == 0) | |
1774 return; | |
1775 | |
1776 unsigned i; | |
1777 lambda_vector dist_v; | |
1778 int num_lev = (DDR_LOOP_NEST (ddr)).length (); | |
1779 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) | |
0 | 1780 { |
111 | 1781 unsigned dep_lev = dependence_level (dist_v, num_lev); |
1782 /* Can't do memmove if load depends on store. */ | |
1783 if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr)) | |
1784 return; | |
1785 } | |
1786 | |
1787 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size); | |
1788 partition->kind = PKIND_MEMMOVE; | |
1789 return; | |
1790 } | |
1791 | |
145 | 1792 bool |
1793 loop_distribution::classify_partition (loop_p loop, | |
1794 struct graph *rdg, partition *partition, | |
1795 bitmap stmt_in_all_partitions) | |
111 | 1796 { |
1797 bitmap_iterator bi; | |
1798 unsigned i; | |
1799 data_reference_p single_ld = NULL, single_st = NULL; | |
1800 bool volatiles_p = false, has_reduction = false; | |
1801 | |
1802 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi) | |
1803 { | |
1804 gimple *stmt = RDG_STMT (rdg, i); | |
1805 | |
1806 if (gimple_has_volatile_ops (stmt)) | |
1807 volatiles_p = true; | |
1808 | |
1809 /* If the stmt is not included by all partitions and there is uses | |
1810 outside of the loop, then mark the partition as reduction. */ | |
1811 if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) | |
0 | 1812 { |
111 | 1813 /* Due to limitation in the transform phase we have to fuse all |
1814 reduction partitions. As a result, this could cancel valid | |
1815 loop distribution especially for loop that induction variable | |
1816 is used outside of loop. To workaround this issue, we skip | |
1817 marking partition as reudction if the reduction stmt belongs | |
1818 to all partitions. In such case, reduction will be computed | |
1819 correctly no matter how partitions are fused/distributed. */ | |
1820 if (!bitmap_bit_p (stmt_in_all_partitions, i)) | |
145 | 1821 partition->reduction_p = true; |
1822 else | |
1823 has_reduction = true; | |
0 | 1824 } |
1825 } | |
1826 | |
145 | 1827 /* Simple workaround to prevent classifying the partition as builtin |
1828 if it contains any use outside of loop. For the case where all | |
1829 partitions have the reduction this simple workaround is delayed | |
1830 to only affect the last partition. */ | |
1831 if (partition->reduction_p) | |
1832 return has_reduction; | |
1833 | |
111 | 1834 /* Perform general partition disqualification for builtins. */ |
1835 if (volatiles_p | |
1836 || !flag_tree_loop_distribute_patterns) | |
145 | 1837 return has_reduction; |
111 | 1838 |
1839 /* Find single load/store data references for builtin partition. */ | |
1840 if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld)) | |
145 | 1841 return has_reduction; |
1842 | |
1843 partition->loc = gimple_location (DR_STMT (single_st)); | |
111 | 1844 |
1845 /* Classify the builtin kind. */ | |
1846 if (single_ld == NULL) | |
1847 classify_builtin_st (loop, partition, single_st); | |
1848 else | |
1849 classify_builtin_ldst (loop, rdg, partition, single_st, single_ld); | |
145 | 1850 return has_reduction; |
0 | 1851 } |
1852 | |
145 | 1853 bool |
1854 loop_distribution::share_memory_accesses (struct graph *rdg, | |
111 | 1855 partition *partition1, partition *partition2) |
67
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1856 { |
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1857 unsigned i, j; |
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1858 bitmap_iterator bi, bj; |
111 | 1859 data_reference_p dr1, dr2; |
1860 | |
1861 /* First check whether in the intersection of the two partitions are | |
1862 any loads or stores. Common loads are the situation that happens | |
1863 most often. */ | |
1864 EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi) | |
67
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1865 if (RDG_MEM_WRITE_STMT (rdg, i) |
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1866 || RDG_MEM_READS_STMT (rdg, i)) |
111 | 1867 return true; |
1868 | |
1869 /* Then check whether the two partitions access the same memory object. */ | |
1870 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi) | |
1871 { | |
1872 dr1 = datarefs_vec[i]; | |
1873 | |
1874 if (!DR_BASE_ADDRESS (dr1) | |
1875 || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1)) | |
1876 continue; | |
1877 | |
1878 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj) | |
1879 { | |
1880 dr2 = datarefs_vec[j]; | |
1881 | |
1882 if (!DR_BASE_ADDRESS (dr2) | |
1883 || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2)) | |
1884 continue; | |
1885 | |
1886 if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0) | |
1887 && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0) | |
1888 && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0) | |
1889 && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)) | |
67
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1890 return true; |
111 | 1891 } |
67
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1892 } |
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1893 |
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1894 return false; |
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1895 } |
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1896 |
111 | 1897 /* For each seed statement in STARTING_STMTS, this function builds |
1898 partition for it by adding depended statements according to RDG. | |
1899 All partitions are recorded in PARTITIONS. */ | |
67
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1900 |
145 | 1901 void |
1902 loop_distribution::rdg_build_partitions (struct graph *rdg, | |
1903 vec<gimple *> starting_stmts, | |
1904 vec<partition *> *partitions) | |
67
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1905 { |
111 | 1906 auto_bitmap processed; |
0 | 1907 int i; |
111 | 1908 gimple *stmt; |
1909 | |
1910 FOR_EACH_VEC_ELT (starting_stmts, i, stmt) | |
0 | 1911 { |
111 | 1912 int v = rdg_vertex_for_stmt (rdg, stmt); |
1913 | |
1914 if (dump_file && (dump_flags & TDF_DETAILS)) | |
1915 fprintf (dump_file, | |
1916 "ldist asked to generate code for vertex %d\n", v); | |
1917 | |
1918 /* If the vertex is already contained in another partition so | |
1919 is the partition rooted at it. */ | |
0 | 1920 if (bitmap_bit_p (processed, v)) |
1921 continue; | |
55
77e2b8dfacca
update it from 4.4.3 to 4.5.0
ryoma <e075725@ie.u-ryukyu.ac.jp>
parents:
0
diff
changeset
|
1922 |
111 | 1923 partition *partition = build_rdg_partition_for_vertex (rdg, v); |
1924 bitmap_ior_into (processed, partition->stmts); | |
1925 | |
1926 if (dump_file && (dump_flags & TDF_DETAILS)) | |
0 | 1927 { |
111 | 1928 fprintf (dump_file, "ldist creates useful %s partition:\n", |
1929 partition->type == PTYPE_PARALLEL ? "parallel" : "sequent"); | |
1930 bitmap_print (dump_file, partition->stmts, " ", "\n"); | |
0 | 1931 } |
111 | 1932 |
1933 partitions->safe_push (partition); | |
0 | 1934 } |
1935 | |
111 | 1936 /* All vertices should have been assigned to at least one partition now, |
1937 other than vertices belonging to dead code. */ | |
0 | 1938 } |
1939 | |
1940 /* Dump to FILE the PARTITIONS. */ | |
1941 | |
1942 static void | |
111 | 1943 dump_rdg_partitions (FILE *file, vec<partition *> partitions) |
0 | 1944 { |
1945 int i; | |
111 | 1946 partition *partition; |
1947 | |
1948 FOR_EACH_VEC_ELT (partitions, i, partition) | |
1949 debug_bitmap_file (file, partition->stmts); | |
0 | 1950 } |
1951 | |
1952 /* Debug PARTITIONS. */ | |
111 | 1953 extern void debug_rdg_partitions (vec<partition *> ); |
0 | 1954 |
67
f6334be47118
update gcc from gcc-4.6-20100522 to gcc-4.6-20110318
nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
parents:
63
diff
changeset
|
1955 DEBUG_FUNCTION void |
111 | 1956 debug_rdg_partitions (vec<partition *> partitions) |
0 | 1957 { |
1958 dump_rdg_partitions (stderr, partitions); | |
1959 } | |
1960 | |
1961 /* Returns the number of read and write operations in the RDG. */ | |
1962 | |
1963 static int | |
1964 number_of_rw_in_rdg (struct graph *rdg) | |
1965 { | |
1966 int i, res = 0; | |
1967 | |
1968 for (i = 0; i < rdg->n_vertices; i++) | |
1969 { | |
1970 if (RDG_MEM_WRITE_STMT (rdg, i)) | |
1971 ++res; | |
1972 | |
1973 if (RDG_MEM_READS_STMT (rdg, i)) | |
1974 ++res; | |
1975 } | |
1976 | |
1977 return res; | |
1978 } | |
1979 | |
1980 /* Returns the number of read and write operations in a PARTITION of | |
1981 the RDG. */ | |
1982 | |
1983 static int | |
111 | 1984 number_of_rw_in_partition (struct graph *rdg, partition *partition) |
0 | 1985 { |
1986 int res = 0; | |
1987 unsigned i; | |
1988 bitmap_iterator ii; | |
1989 | |
111 | 1990 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii) |
0 | 1991 { |
1992 if (RDG_MEM_WRITE_STMT (rdg, i)) | |
1993 ++res; | |
1994 | |
1995 if (RDG_MEM_READS_STMT (rdg, i)) | |
1996 ++res; | |
1997 } | |
1998 | |
1999 return res; | |
2000 } | |
2001 | |
2002 /* Returns true when one of the PARTITIONS contains all the read or | |
2003 write operations of RDG. */ | |
2004 | |
2005 static bool | |
111 | 2006 partition_contains_all_rw (struct graph *rdg, |
2007 vec<partition *> partitions) | |
0 | 2008 { |
2009 int i; | |
111 | 2010 partition *partition; |
0 | 2011 int nrw = number_of_rw_in_rdg (rdg); |
2012 | |
111 | 2013 FOR_EACH_VEC_ELT (partitions, i, partition) |
0 | 2014 if (nrw == number_of_rw_in_partition (rdg, partition)) |
2015 return true; | |
2016 | |
2017 return false; | |
2018 } | |
2019 | |
145 | 2020 int |
2021 loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir, | |
111 | 2022 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs) |
2023 { | |
2024 unsigned i, j; | |
2025 bitmap_iterator bi, bj; | |
2026 data_reference_p dr1, dr2, saved_dr1; | |
2027 | |
2028 /* dependence direction - 0 is no dependence, -1 is back, | |
2029 1 is forth, 2 is both (we can stop then, merging will occur). */ | |
2030 EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi) | |
2031 { | |
2032 dr1 = datarefs_vec[i]; | |
2033 | |
2034 EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj) | |
2035 { | |
2036 int res, this_dir = 1; | |
2037 ddr_p ddr; | |
2038 | |
2039 dr2 = datarefs_vec[j]; | |
2040 | |
2041 /* Skip all <read, read> data dependence. */ | |
2042 if (DR_IS_READ (dr1) && DR_IS_READ (dr2)) | |
2043 continue; | |
2044 | |
2045 saved_dr1 = dr1; | |
2046 /* Re-shuffle data-refs to be in topological order. */ | |
2047 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1)) | |
2048 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2))) | |
2049 { | |
2050 std::swap (dr1, dr2); | |
2051 this_dir = -this_dir; | |
2052 } | |
2053 ddr = get_data_dependence (rdg, dr1, dr2); | |
2054 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) | |
2055 { | |
2056 this_dir = 0; | |
2057 res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1), | |
2058 DR_BASE_ADDRESS (dr2)); | |
2059 /* Be conservative. If data references are not well analyzed, | |
2060 or the two data references have the same base address and | |
2061 offset, add dependence and consider it alias to each other. | |
145 | 2062 In other words, the dependence cannot be resolved by |
111 | 2063 runtime alias check. */ |
2064 if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2) | |
2065 || !DR_OFFSET (dr1) || !DR_OFFSET (dr2) | |
2066 || !DR_INIT (dr1) || !DR_INIT (dr2) | |
2067 || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1)) | |
2068 || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2)) | |
2069 || res == 0) | |
2070 this_dir = 2; | |
2071 /* Data dependence could be resolved by runtime alias check, | |
2072 record it in ALIAS_DDRS. */ | |
2073 else if (alias_ddrs != NULL) | |
2074 alias_ddrs->safe_push (ddr); | |
2075 /* Or simply ignore it. */ | |
2076 } | |
2077 else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE) | |
2078 { | |
2079 if (DDR_REVERSED_P (ddr)) | |
2080 this_dir = -this_dir; | |
2081 | |
2082 /* Known dependences can still be unordered througout the | |
2083 iteration space, see gcc.dg/tree-ssa/ldist-16.c. */ | |
2084 if (DDR_NUM_DIST_VECTS (ddr) != 1) | |
2085 this_dir = 2; | |
2086 /* If the overlap is exact preserve stmt order. */ | |
131 | 2087 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), |
2088 DDR_NB_LOOPS (ddr))) | |
111 | 2089 ; |
2090 /* Else as the distance vector is lexicographic positive swap | |
2091 the dependence direction. */ | |
2092 else | |
2093 this_dir = -this_dir; | |
2094 } | |
2095 else | |
2096 this_dir = 0; | |
2097 if (this_dir == 2) | |
2098 return 2; | |
2099 else if (dir == 0) | |
2100 dir = this_dir; | |
2101 else if (this_dir != 0 && dir != this_dir) | |
2102 return 2; | |
2103 /* Shuffle "back" dr1. */ | |
2104 dr1 = saved_dr1; | |
2105 } | |
2106 } | |
2107 return dir; | |
2108 } | |
2109 | |
2110 /* Compare postorder number of the partition graph vertices V1 and V2. */ | |
0 | 2111 |
2112 static int | |
111 | 2113 pgcmp (const void *v1_, const void *v2_) |
2114 { | |
2115 const vertex *v1 = (const vertex *)v1_; | |
2116 const vertex *v2 = (const vertex *)v2_; | |
2117 return v2->post - v1->post; | |
2118 } | |
2119 | |
2120 /* Data attached to vertices of partition dependence graph. */ | |
2121 struct pg_vdata | |
2122 { | |
2123 /* ID of the corresponding partition. */ | |
2124 int id; | |
2125 /* The partition. */ | |
2126 struct partition *partition; | |
2127 }; | |
2128 | |
2129 /* Data attached to edges of partition dependence graph. */ | |
2130 struct pg_edata | |
2131 { | |
2132 /* If the dependence edge can be resolved by runtime alias check, | |
2133 this vector contains data dependence relations for runtime alias | |
2134 check. On the other hand, if the dependence edge is introduced | |
2135 because of compilation time known data dependence, this vector | |
2136 contains nothing. */ | |
2137 vec<ddr_p> alias_ddrs; | |
2138 }; | |
2139 | |
2140 /* Callback data for traversing edges in graph. */ | |
2141 struct pg_edge_callback_data | |
0 | 2142 { |
111 | 2143 /* Bitmap contains strong connected components should be merged. */ |
2144 bitmap sccs_to_merge; | |
2145 /* Array constains component information for all vertices. */ | |
2146 int *vertices_component; | |
2147 /* Vector to record all data dependence relations which are needed | |
2148 to break strong connected components by runtime alias checks. */ | |
2149 vec<ddr_p> *alias_ddrs; | |
2150 }; | |
2151 | |
2152 /* Initialize vertice's data for partition dependence graph PG with | |
2153 PARTITIONS. */ | |
2154 | |
2155 static void | |
2156 init_partition_graph_vertices (struct graph *pg, | |
2157 vec<struct partition *> *partitions) | |
2158 { | |
2159 int i; | |
2160 partition *partition; | |
2161 struct pg_vdata *data; | |
2162 | |
2163 for (i = 0; partitions->iterate (i, &partition); ++i) | |
2164 { | |
2165 data = new pg_vdata; | |
2166 pg->vertices[i].data = data; | |
2167 data->id = i; | |
2168 data->partition = partition; | |
2169 } | |
2170 } | |
2171 | |
2172 /* Add edge <I, J> to partition dependence graph PG. Attach vector of data | |
2173 dependence relations to the EDGE if DDRS isn't NULL. */ | |
2174 | |
2175 static void | |
2176 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs) | |
2177 { | |
2178 struct graph_edge *e = add_edge (pg, i, j); | |
2179 | |
2180 /* If the edge is attached with data dependence relations, it means this | |
2181 dependence edge can be resolved by runtime alias checks. */ | |
2182 if (ddrs != NULL) | |
0 | 2183 { |
111 | 2184 struct pg_edata *data = new pg_edata; |
2185 | |
2186 gcc_assert (ddrs->length () > 0); | |
2187 e->data = data; | |
2188 data->alias_ddrs = vNULL; | |
2189 data->alias_ddrs.safe_splice (*ddrs); | |
2190 } | |
2191 } | |
2192 | |
2193 /* Callback function for graph travesal algorithm. It returns true | |
2194 if edge E should skipped when traversing the graph. */ | |
2195 | |
2196 static bool | |
2197 pg_skip_alias_edge (struct graph_edge *e) | |
2198 { | |
2199 struct pg_edata *data = (struct pg_edata *)e->data; | |
2200 return (data != NULL && data->alias_ddrs.length () > 0); | |
2201 } | |
2202 | |
2203 /* Callback function freeing data attached to edge E of graph. */ | |
2204 | |
2205 static void | |
2206 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *) | |
2207 { | |
2208 if (e->data != NULL) | |
2209 { | |
2210 struct pg_edata *data = (struct pg_edata *)e->data; | |
2211 data->alias_ddrs.release (); | |
2212 delete data; | |
2213 } | |
2214 } | |
2215 | |
2216 /* Free data attached to vertice of partition dependence graph PG. */ | |
2217 | |
2218 static void | |
2219 free_partition_graph_vdata (struct graph *pg) | |
2220 { | |
2221 int i; | |
2222 struct pg_vdata *data; | |
2223 | |
2224 for (i = 0; i < pg->n_vertices; ++i) | |
2225 { | |
2226 data = (struct pg_vdata *)pg->vertices[i].data; | |
2227 delete data; | |
2228 } | |
2229 } | |
2230 | |
2231 /* Build and return partition dependence graph for PARTITIONS. RDG is | |
2232 reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P | |
2233 is true, data dependence caused by possible alias between references | |
2234 is ignored, as if it doesn't exist at all; otherwise all depdendences | |
2235 are considered. */ | |
2236 | |
145 | 2237 struct graph * |
2238 loop_distribution::build_partition_graph (struct graph *rdg, | |
2239 vec<struct partition *> *partitions, | |
2240 bool ignore_alias_p) | |
111 | 2241 { |
2242 int i, j; | |
2243 struct partition *partition1, *partition2; | |
2244 graph *pg = new_graph (partitions->length ()); | |
2245 auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p; | |
2246 | |
2247 alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs; | |
2248 | |
2249 init_partition_graph_vertices (pg, partitions); | |
2250 | |
2251 for (i = 0; partitions->iterate (i, &partition1); ++i) | |
2252 { | |
2253 for (j = i + 1; partitions->iterate (j, &partition2); ++j) | |
0 | 2254 { |
111 | 2255 /* dependence direction - 0 is no dependence, -1 is back, |
2256 1 is forth, 2 is both (we can stop then, merging will occur). */ | |
2257 int dir = 0; | |
2258 | |
2259 /* If the first partition has reduction, add back edge; if the | |
2260 second partition has reduction, add forth edge. This makes | |
2261 sure that reduction partition will be sorted as the last one. */ | |
2262 if (partition_reduction_p (partition1)) | |
2263 dir = -1; | |
2264 else if (partition_reduction_p (partition2)) | |
2265 dir = 1; | |
2266 | |
2267 /* Cleanup the temporary vector. */ | |
2268 alias_ddrs.truncate (0); | |
2269 | |
2270 dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs, | |
2271 partition2->datarefs, alias_ddrs_p); | |
2272 | |
2273 /* Add edge to partition graph if there exists dependence. There | |
2274 are two types of edges. One type edge is caused by compilation | |
145 | 2275 time known dependence, this type cannot be resolved by runtime |
111 | 2276 alias check. The other type can be resolved by runtime alias |
2277 check. */ | |
2278 if (dir == 1 || dir == 2 | |
2279 || alias_ddrs.length () > 0) | |
2280 { | |
2281 /* Attach data dependence relations to edge that can be resolved | |
2282 by runtime alias check. */ | |
2283 bool alias_edge_p = (dir != 1 && dir != 2); | |
2284 add_partition_graph_edge (pg, i, j, | |
2285 (alias_edge_p) ? &alias_ddrs : NULL); | |
2286 } | |
2287 if (dir == -1 || dir == 2 | |
2288 || alias_ddrs.length () > 0) | |
2289 { | |
2290 /* Attach data dependence relations to edge that can be resolved | |
2291 by runtime alias check. */ | |
2292 bool alias_edge_p = (dir != -1 && dir != 2); | |
2293 add_partition_graph_edge (pg, j, i, | |
2294 (alias_edge_p) ? &alias_ddrs : NULL); | |
2295 } | |
0 | 2296 } |
2297 } | |
111 | 2298 return pg; |
2299 } | |
2300 | |
2301 /* Sort partitions in PG in descending post order and store them in | |
2302 PARTITIONS. */ | |
2303 | |
2304 static void | |
2305 sort_partitions_by_post_order (struct graph *pg, | |
2306 vec<struct partition *> *partitions) | |
2307 { | |
2308 int i; | |
2309 struct pg_vdata *data; | |
2310 | |
2311 /* Now order the remaining nodes in descending postorder. */ | |
2312 qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp); | |
2313 partitions->truncate (0); | |
2314 for (i = 0; i < pg->n_vertices; ++i) | |
2315 { | |
2316 data = (struct pg_vdata *)pg->vertices[i].data; | |
2317 if (data->partition) | |
2318 partitions->safe_push (data->partition); | |
2319 } | |
2320 } | |
2321 | |
145 | 2322 void |
2323 loop_distribution::merge_dep_scc_partitions (struct graph *rdg, | |
2324 vec<struct partition *> *partitions, | |
2325 bool ignore_alias_p) | |
111 | 2326 { |
2327 struct partition *partition1, *partition2; | |
2328 struct pg_vdata *data; | |
2329 graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p); | |
2330 int i, j, num_sccs = graphds_scc (pg, NULL); | |
2331 | |
2332 /* Strong connected compoenent means dependence cycle, we cannot distribute | |
2333 them. So fuse them together. */ | |
2334 if ((unsigned) num_sccs < partitions->length ()) | |
2335 { | |
2336 for (i = 0; i < num_sccs; ++i) | |
2337 { | |
2338 for (j = 0; partitions->iterate (j, &partition1); ++j) | |
2339 if (pg->vertices[j].component == i) | |
2340 break; | |
2341 for (j = j + 1; partitions->iterate (j, &partition2); ++j) | |
2342 if (pg->vertices[j].component == i) | |
2343 { | |
2344 partition_merge_into (NULL, partition1, | |
2345 partition2, FUSE_SAME_SCC); | |
2346 partition1->type = PTYPE_SEQUENTIAL; | |
2347 (*partitions)[j] = NULL; | |
2348 partition_free (partition2); | |
2349 data = (struct pg_vdata *)pg->vertices[j].data; | |
2350 data->partition = NULL; | |
2351 } | |
2352 } | |
2353 } | |
2354 | |
2355 sort_partitions_by_post_order (pg, partitions); | |
2356 gcc_assert (partitions->length () == (unsigned)num_sccs); | |
2357 free_partition_graph_vdata (pg); | |
2358 free_graph (pg); | |
2359 } | |
2360 | |
2361 /* Callback function for traversing edge E in graph G. DATA is private | |
2362 callback data. */ | |
2363 | |
2364 static void | |
2365 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data) | |
2366 { | |
2367 int i, j, component; | |
2368 struct pg_edge_callback_data *cbdata; | |
2369 struct pg_edata *edata = (struct pg_edata *) e->data; | |
2370 | |
2371 /* If the edge doesn't have attached data dependence, it represents | |
2372 compilation time known dependences. This type dependence cannot | |
2373 be resolved by runtime alias check. */ | |
2374 if (edata == NULL || edata->alias_ddrs.length () == 0) | |
2375 return; | |
2376 | |
2377 cbdata = (struct pg_edge_callback_data *) data; | |
2378 i = e->src; | |
2379 j = e->dest; | |
2380 component = cbdata->vertices_component[i]; | |
2381 /* Vertices are topologically sorted according to compilation time | |
2382 known dependences, so we can break strong connected components | |
2383 by removing edges of the opposite direction, i.e, edges pointing | |
2384 from vertice with smaller post number to vertice with bigger post | |
2385 number. */ | |
2386 if (g->vertices[i].post < g->vertices[j].post | |
2387 /* We only need to remove edges connecting vertices in the same | |
2388 strong connected component to break it. */ | |
2389 && component == cbdata->vertices_component[j] | |
2390 /* Check if we want to break the strong connected component or not. */ | |
2391 && !bitmap_bit_p (cbdata->sccs_to_merge, component)) | |
2392 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs); | |
2393 } | |
2394 | |
2395 /* This is the main function breaking strong conected components in | |
2396 PARTITIONS giving reduced depdendence graph RDG. Store data dependence | |
2397 relations for runtime alias check in ALIAS_DDRS. */ | |
145 | 2398 void |
2399 loop_distribution::break_alias_scc_partitions (struct graph *rdg, | |
2400 vec<struct partition *> *partitions, | |
2401 vec<ddr_p> *alias_ddrs) | |
111 | 2402 { |
2403 int i, j, k, num_sccs, num_sccs_no_alias; | |
2404 /* Build partition dependence graph. */ | |
2405 graph *pg = build_partition_graph (rdg, partitions, false); | |
2406 | |
2407 alias_ddrs->truncate (0); | |
2408 /* Find strong connected components in the graph, with all dependence edges | |
2409 considered. */ | |
2410 num_sccs = graphds_scc (pg, NULL); | |
2411 /* All SCCs now can be broken by runtime alias checks because SCCs caused by | |
2412 compilation time known dependences are merged before this function. */ | |
2413 if ((unsigned) num_sccs < partitions->length ()) | |
2414 { | |
2415 struct pg_edge_callback_data cbdata; | |
2416 auto_bitmap sccs_to_merge; | |
2417 auto_vec<enum partition_type> scc_types; | |
2418 struct partition *partition, *first; | |
2419 | |
2420 /* If all partitions in a SCC have the same type, we can simply merge the | |
2421 SCC. This loop finds out such SCCS and record them in bitmap. */ | |
2422 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs); | |
2423 for (i = 0; i < num_sccs; ++i) | |
2424 { | |
2425 for (j = 0; partitions->iterate (j, &first); ++j) | |
2426 if (pg->vertices[j].component == i) | |
2427 break; | |
131 | 2428 |
2429 bool same_type = true, all_builtins = partition_builtin_p (first); | |
111 | 2430 for (++j; partitions->iterate (j, &partition); ++j) |
2431 { | |
2432 if (pg->vertices[j].component != i) | |
2433 continue; | |
2434 | |
2435 if (first->type != partition->type) | |
2436 { | |
131 | 2437 same_type = false; |
111 | 2438 break; |
2439 } | |
131 | 2440 all_builtins &= partition_builtin_p (partition); |
111 | 2441 } |
131 | 2442 /* Merge SCC if all partitions in SCC have the same type, though the |
2443 result partition is sequential, because vectorizer can do better | |
2444 runtime alias check. One expecption is all partitions in SCC are | |
2445 builtins. */ | |
2446 if (!same_type || all_builtins) | |
2447 bitmap_clear_bit (sccs_to_merge, i); | |
111 | 2448 } |
2449 | |
2450 /* Initialize callback data for traversing. */ | |
2451 cbdata.sccs_to_merge = sccs_to_merge; | |
2452 cbdata.alias_ddrs = alias_ddrs; | |
2453 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices); | |
2454 /* Record the component information which will be corrupted by next | |
2455 graph scc finding call. */ | |
2456 for (i = 0; i < pg->n_vertices; ++i) | |
2457 cbdata.vertices_component[i] = pg->vertices[i].component; | |
2458 | |
2459 /* Collect data dependences for runtime alias checks to break SCCs. */ | |
2460 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs) | |
2461 { | |
2462 /* Run SCC finding algorithm again, with alias dependence edges | |
2463 skipped. This is to topologically sort partitions according to | |
2464 compilation time known dependence. Note the topological order | |
2465 is stored in the form of pg's post order number. */ | |
2466 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge); | |
2467 gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias); | |
2468 /* With topological order, we can construct two subgraphs L and R. | |
2469 L contains edge <x, y> where x < y in terms of post order, while | |
2470 R contains edge <x, y> where x > y. Edges for compilation time | |
2471 known dependence all fall in R, so we break SCCs by removing all | |
2472 (alias) edges of in subgraph L. */ | |
2473 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata); | |
2474 } | |
2475 | |
2476 /* For SCC that doesn't need to be broken, merge it. */ | |
2477 for (i = 0; i < num_sccs; ++i) | |
2478 { | |
2479 if (!bitmap_bit_p (sccs_to_merge, i)) | |
2480 continue; | |
2481 | |
2482 for (j = 0; partitions->iterate (j, &first); ++j) | |
2483 if (cbdata.vertices_component[j] == i) | |
2484 break; | |
2485 for (k = j + 1; partitions->iterate (k, &partition); ++k) | |
2486 { | |
2487 struct pg_vdata *data; | |
2488 | |
2489 if (cbdata.vertices_component[k] != i) | |
2490 continue; | |
2491 | |
2492 /* Update postorder number so that merged reduction partition is | |
2493 sorted after other partitions. */ | |
2494 if (!partition_reduction_p (first) | |
2495 && partition_reduction_p (partition)) | |
2496 { | |
2497 gcc_assert (pg->vertices[k].post < pg->vertices[j].post); | |
2498 pg->vertices[j].post = pg->vertices[k].post; | |
2499 } | |
2500 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC); | |
2501 (*partitions)[k] = NULL; | |
2502 partition_free (partition); | |
2503 data = (struct pg_vdata *)pg->vertices[k].data; | |
2504 gcc_assert (data->id == k); | |
2505 data->partition = NULL; | |
2506 /* The result partition of merged SCC must be sequential. */ | |
2507 first->type = PTYPE_SEQUENTIAL; | |
2508 } | |
2509 } | |
2510 } | |
2511 | |
2512 sort_partitions_by_post_order (pg, partitions); | |
2513 free_partition_graph_vdata (pg); | |
2514 for_each_edge (pg, free_partition_graph_edata_cb, NULL); | |
2515 free_graph (pg); | |
0 | 2516 |
2517 if (dump_file && (dump_flags & TDF_DETAILS)) | |
111 | 2518 { |
2519 fprintf (dump_file, "Possible alias data dependence to break:\n"); | |
2520 dump_data_dependence_relations (dump_file, *alias_ddrs); | |
2521 } | |
2522 } | |
2523 | |
2524 /* Compute and return an expression whose value is the segment length which | |
2525 will be accessed by DR in NITERS iterations. */ | |
2526 | |
2527 static tree | |
2528 data_ref_segment_size (struct data_reference *dr, tree niters) | |
2529 { | |
131 | 2530 niters = size_binop (MINUS_EXPR, |
2531 fold_convert (sizetype, niters), | |
2532 size_one_node); | |
2533 return size_binop (MULT_EXPR, | |
2534 fold_convert (sizetype, DR_STEP (dr)), | |
2535 fold_convert (sizetype, niters)); | |
111 | 2536 } |
2537 | |
2538 /* Return true if LOOP's latch is dominated by statement for data reference | |
2539 DR. */ | |
2540 | |
2541 static inline bool | |
145 | 2542 latch_dominated_by_data_ref (class loop *loop, data_reference *dr) |
111 | 2543 { |
2544 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, | |
2545 gimple_bb (DR_STMT (dr))); | |
2546 } | |
2547 | |
2548 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's | |
2549 data dependence relations ALIAS_DDRS. */ | |
2550 | |
2551 static void | |
145 | 2552 compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs, |
111 | 2553 vec<dr_with_seg_len_pair_t> *comp_alias_pairs) |
2554 { | |
2555 unsigned int i; | |
2556 unsigned HOST_WIDE_INT factor = 1; | |
2557 tree niters_plus_one, niters = number_of_latch_executions (loop); | |
2558 | |
2559 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know); | |
2560 niters = fold_convert (sizetype, niters); | |
2561 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node); | |
2562 | |
2563 if (dump_file && (dump_flags & TDF_DETAILS)) | |
2564 fprintf (dump_file, "Creating alias check pairs:\n"); | |
2565 | |
2566 /* Iterate all data dependence relations and compute alias check pairs. */ | |
2567 for (i = 0; i < alias_ddrs->length (); i++) | |
2568 { | |
2569 ddr_p ddr = (*alias_ddrs)[i]; | |
2570 struct data_reference *dr_a = DDR_A (ddr); | |
2571 struct data_reference *dr_b = DDR_B (ddr); | |
2572 tree seg_length_a, seg_length_b; | |
2573 | |
2574 if (latch_dominated_by_data_ref (loop, dr_a)) | |
2575 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one); | |
2576 else | |
2577 seg_length_a = data_ref_segment_size (dr_a, niters); | |
2578 | |
2579 if (latch_dominated_by_data_ref (loop, dr_b)) | |
2580 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one); | |
2581 else | |
2582 seg_length_b = data_ref_segment_size (dr_b, niters); | |
2583 | |
131 | 2584 unsigned HOST_WIDE_INT access_size_a |
2585 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a)))); | |
2586 unsigned HOST_WIDE_INT access_size_b | |
2587 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b)))); | |
2588 unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a))); | |
2589 unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b))); | |
2590 | |
111 | 2591 dr_with_seg_len_pair_t dr_with_seg_len_pair |
131 | 2592 (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a), |
145 | 2593 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b), |
2594 /* ??? Would WELL_ORDERED be safe? */ | |
2595 dr_with_seg_len_pair_t::REORDERED); | |
111 | 2596 |
2597 comp_alias_pairs->safe_push (dr_with_seg_len_pair); | |
2598 } | |
2599 | |
2600 if (tree_fits_uhwi_p (niters)) | |
2601 factor = tree_to_uhwi (niters); | |
2602 | |
2603 /* Prune alias check pairs. */ | |
2604 prune_runtime_alias_test_list (comp_alias_pairs, factor); | |
2605 if (dump_file && (dump_flags & TDF_DETAILS)) | |
2606 fprintf (dump_file, | |
2607 "Improved number of alias checks from %d to %d\n", | |
2608 alias_ddrs->length (), comp_alias_pairs->length ()); | |
2609 } | |
2610 | |
2611 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias | |
2612 checks and version LOOP under condition of these runtime alias checks. */ | |
2613 | |
2614 static void | |
131 | 2615 version_loop_by_alias_check (vec<struct partition *> *partitions, |
145 | 2616 class loop *loop, vec<ddr_p> *alias_ddrs) |
111 | 2617 { |
2618 profile_probability prob; | |
2619 basic_block cond_bb; | |
145 | 2620 class loop *nloop; |
111 | 2621 tree lhs, arg0, cond_expr = NULL_TREE; |
2622 gimple_seq cond_stmts = NULL; | |
2623 gimple *call_stmt = NULL; | |
2624 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs; | |
2625 | |
2626 /* Generate code for runtime alias checks if necessary. */ | |
2627 gcc_assert (alias_ddrs->length () > 0); | |
2628 | |
2629 if (dump_file && (dump_flags & TDF_DETAILS)) | |
2630 fprintf (dump_file, | |
2631 "Version loop <%d> with runtime alias check\n", loop->num); | |
2632 | |
2633 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs); | |
2634 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr); | |
2635 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts, | |
2636 is_gimple_val, NULL_TREE); | |
2637 | |
2638 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */ | |
131 | 2639 bool cancelable_p = flag_tree_loop_vectorize; |
2640 if (cancelable_p) | |
111 | 2641 { |
131 | 2642 unsigned i = 0; |
2643 struct partition *partition; | |
2644 for (; partitions->iterate (i, &partition); ++i) | |
2645 if (!partition_builtin_p (partition)) | |
2646 break; | |
2647 | |
2648 /* If all partitions are builtins, distributing it would be profitable and | |
2649 we don't want to cancel the runtime alias checks. */ | |
2650 if (i == partitions->length ()) | |
2651 cancelable_p = false; | |
2652 } | |
2653 | |
2654 /* Generate internal function call for loop distribution alias check if the | |
2655 runtime alias check should be cancelable. */ | |
2656 if (cancelable_p) | |
2657 { | |
111 | 2658 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS, |
2659 2, NULL_TREE, cond_expr); | |
2660 lhs = make_ssa_name (boolean_type_node); | |
2661 gimple_call_set_lhs (call_stmt, lhs); | |
2662 } | |
2663 else | |
2664 lhs = cond_expr; | |
2665 | |
2666 prob = profile_probability::guessed_always ().apply_scale (9, 10); | |
2667 initialize_original_copy_tables (); | |
2668 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (), | |
2669 prob, prob.invert (), true); | |
2670 free_original_copy_tables (); | |
2671 /* Record the original loop number in newly generated loops. In case of | |
2672 distribution, the original loop will be distributed and the new loop | |
2673 is kept. */ | |
2674 loop->orig_loop_num = nloop->num; | |
2675 nloop->orig_loop_num = nloop->num; | |
2676 nloop->dont_vectorize = true; | |
2677 nloop->force_vectorize = false; | |
2678 | |
2679 if (call_stmt) | |
2680 { | |
2681 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original | |
2682 loop could be destroyed. */ | |
2683 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num); | |
2684 gimple_call_set_arg (call_stmt, 0, arg0); | |
2685 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt); | |
2686 } | |
2687 | |
2688 if (cond_stmts) | |
2689 { | |
2690 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb); | |
2691 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT); | |
2692 } | |
2693 update_ssa (TODO_update_ssa); | |
0 | 2694 } |
2695 | |
111 | 2696 /* Return true if loop versioning is needed to distrubute PARTITIONS. |
2697 ALIAS_DDRS are data dependence relations for runtime alias check. */ | |
2698 | |
2699 static inline bool | |
2700 version_for_distribution_p (vec<struct partition *> *partitions, | |
2701 vec<ddr_p> *alias_ddrs) | |
2702 { | |
2703 /* No need to version loop if we have only one partition. */ | |
2704 if (partitions->length () == 1) | |
2705 return false; | |
2706 | |
2707 /* Need to version loop if runtime alias check is necessary. */ | |
2708 return (alias_ddrs->length () > 0); | |
2709 } | |
2710 | |
2711 /* Compare base offset of builtin mem* partitions P1 and P2. */ | |
2712 | |
131 | 2713 static int |
2714 offset_cmp (const void *vp1, const void *vp2) | |
111 | 2715 { |
131 | 2716 struct partition *p1 = *(struct partition *const *) vp1; |
2717 struct partition *p2 = *(struct partition *const *) vp2; | |
2718 unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset; | |
2719 unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset; | |
2720 return (o2 < o1) - (o1 < o2); | |
111 | 2721 } |
2722 | |
2723 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special | |
2724 case optimization transforming below code: | |
2725 | |
2726 __builtin_memset (&obj, 0, 100); | |
2727 _1 = &obj + 100; | |
2728 __builtin_memset (_1, 0, 200); | |
2729 _2 = &obj + 300; | |
2730 __builtin_memset (_2, 0, 100); | |
2731 | |
2732 into: | |
2733 | |
2734 __builtin_memset (&obj, 0, 400); | |
2735 | |
2736 Note we don't have dependence information between different partitions | |
2737 at this point, as a result, we can't handle nonadjacent memset builtin | |
2738 partitions since dependence might be broken. */ | |
2739 | |
2740 static void | |
2741 fuse_memset_builtins (vec<struct partition *> *partitions) | |
2742 { | |
2743 unsigned i, j; | |
2744 struct partition *part1, *part2; | |
131 | 2745 tree rhs1, rhs2; |
111 | 2746 |
2747 for (i = 0; partitions->iterate (i, &part1);) | |
2748 { | |
2749 if (part1->kind != PKIND_MEMSET) | |
2750 { | |
2751 i++; | |
2752 continue; | |
2753 } | |
2754 | |
2755 /* Find sub-array of memset builtins of the same base. Index range | |
2756 of the sub-array is [i, j) with "j > i". */ | |
2757 for (j = i + 1; partitions->iterate (j, &part2); ++j) | |
2758 { | |
2759 if (part2->kind != PKIND_MEMSET | |
2760 || !operand_equal_p (part1->builtin->dst_base_base, | |
2761 part2->builtin->dst_base_base, 0)) | |
2762 break; | |
131 | 2763 |
2764 /* Memset calls setting different values can't be merged. */ | |
2765 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); | |
2766 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); | |
2767 if (!operand_equal_p (rhs1, rhs2, 0)) | |
2768 break; | |
111 | 2769 } |
2770 | |
2771 /* Stable sort is required in order to avoid breaking dependence. */ | |
131 | 2772 gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i], |
2773 offset_cmp); | |
111 | 2774 /* Continue with next partition. */ |
2775 i = j; | |
2776 } | |
2777 | |
2778 /* Merge all consecutive memset builtin partitions. */ | |
2779 for (i = 0; i < partitions->length () - 1;) | |
2780 { | |
2781 part1 = (*partitions)[i]; | |
2782 if (part1->kind != PKIND_MEMSET) | |
2783 { | |
2784 i++; | |
2785 continue; | |
2786 } | |
2787 | |
2788 part2 = (*partitions)[i + 1]; | |
2789 /* Only merge memset partitions of the same base and with constant | |
2790 access sizes. */ | |
2791 if (part2->kind != PKIND_MEMSET | |
2792 || TREE_CODE (part1->builtin->size) != INTEGER_CST | |
2793 || TREE_CODE (part2->builtin->size) != INTEGER_CST | |
2794 || !operand_equal_p (part1->builtin->dst_base_base, | |
2795 part2->builtin->dst_base_base, 0)) | |
2796 { | |
2797 i++; | |
2798 continue; | |
2799 } | |
131 | 2800 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); |
2801 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); | |
111 | 2802 int bytev1 = const_with_all_bytes_same (rhs1); |
2803 int bytev2 = const_with_all_bytes_same (rhs2); | |
2804 /* Only merge memset partitions of the same value. */ | |
2805 if (bytev1 != bytev2 || bytev1 == -1) | |
2806 { | |
2807 i++; | |
2808 continue; | |
2809 } | |
2810 wide_int end1 = wi::add (part1->builtin->dst_base_offset, | |
2811 wi::to_wide (part1->builtin->size)); | |
2812 /* Only merge adjacent memset partitions. */ | |
2813 if (wi::ne_p (end1, part2->builtin->dst_base_offset)) | |
2814 { | |
2815 i++; | |
2816 continue; | |
2817 } | |
2818 /* Merge partitions[i] and partitions[i+1]. */ | |
2819 part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype, | |
2820 part1->builtin->size, | |
2821 part2->builtin->size); | |
2822 partition_free (part2); | |
2823 partitions->ordered_remove (i + 1); | |
2824 } | |
2825 } | |
2826 | |
145 | 2827 void |
2828 loop_distribution::finalize_partitions (class loop *loop, | |
2829 vec<struct partition *> *partitions, | |
2830 vec<ddr_p> *alias_ddrs) | |
111 | 2831 { |
2832 unsigned i; | |
2833 struct partition *partition, *a; | |
2834 | |
2835 if (partitions->length () == 1 | |
2836 || alias_ddrs->length () > 0) | |
2837 return; | |
2838 | |
131 | 2839 unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0; |
111 | 2840 bool same_type_p = true; |
2841 enum partition_type type = ((*partitions)[0])->type; | |
2842 for (i = 0; partitions->iterate (i, &partition); ++i) | |
2843 { | |
2844 same_type_p &= (type == partition->type); | |
131 | 2845 if (partition_builtin_p (partition)) |
2846 { | |
2847 num_builtin++; | |
2848 continue; | |
2849 } | |
2850 num_normal++; | |
2851 if (partition->kind == PKIND_PARTIAL_MEMSET) | |
2852 num_partial_memset++; | |
111 | 2853 } |
2854 | |
2855 /* Don't distribute current loop into too many loops given we don't have | |
2856 memory stream cost model. Be even more conservative in case of loop | |
2857 nest distribution. */ | |
131 | 2858 if ((same_type_p && num_builtin == 0 |
2859 && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1)) | |
111 | 2860 || (loop->inner != NULL |
2861 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1) | |
2862 || (loop->inner == NULL | |
2863 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin)) | |
2864 { | |
2865 a = (*partitions)[0]; | |
2866 for (i = 1; partitions->iterate (i, &partition); ++i) | |
2867 { | |
2868 partition_merge_into (NULL, a, partition, FUSE_FINALIZE); | |
2869 partition_free (partition); | |
2870 } | |
2871 partitions->truncate (1); | |
2872 } | |
2873 | |
2874 /* Fuse memset builtins if possible. */ | |
2875 if (partitions->length () > 1) | |
2876 fuse_memset_builtins (partitions); | |
2877 } | |
2878 | |
2879 /* Distributes the code from LOOP in such a way that producer statements | |
2880 are placed before consumer statements. Tries to separate only the | |
2881 statements from STMTS into separate loops. Returns the number of | |
2882 distributed loops. Set NB_CALLS to number of generated builtin calls. | |
2883 Set *DESTROY_P to whether LOOP needs to be destroyed. */ | |
0 | 2884 |
145 | 2885 int |
2886 loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts, | |
2887 control_dependences *cd, int *nb_calls, bool *destroy_p, | |
2888 bool only_patterns_p) | |
0 | 2889 { |
111 | 2890 ddrs_table = new hash_table<ddr_hasher> (389); |
0 | 2891 struct graph *rdg; |
111 | 2892 partition *partition; |
2893 int i, nbp; | |
2894 | |
2895 *destroy_p = false; | |
2896 *nb_calls = 0; | |
2897 loop_nest.create (0); | |
2898 if (!find_loop_nest (loop, &loop_nest)) | |
0 | 2899 { |
111 | 2900 loop_nest.release (); |
2901 delete ddrs_table; | |
2902 return 0; | |
0 | 2903 } |
2904 | |
111 | 2905 datarefs_vec.create (20); |
145 | 2906 has_nonaddressable_dataref_p = false; |
111 | 2907 rdg = build_rdg (loop, cd); |
0 | 2908 if (!rdg) |
2909 { | |
2910 if (dump_file && (dump_flags & TDF_DETAILS)) | |
2911 fprintf (dump_file, | |
111 | 2912 "Loop %d not distributed: failed to build the RDG.\n", |
0 | 2913 loop->num); |
2914 | |
111 | 2915 loop_nest.release (); |
2916 free_data_refs (datarefs_vec); | |
2917 delete ddrs_table; | |
2918 return 0; | |
0 | 2919 } |
2920 | |
111 | 2921 if (datarefs_vec.length () > MAX_DATAREFS_NUM) |
2922 { | |
2923 if (dump_file && (dump_flags & TDF_DETAILS)) | |
2924 fprintf (dump_file, | |
2925 "Loop %d not distributed: too many memory references.\n", | |
2926 loop->num); | |
2927 | |
2928 free_rdg (rdg); | |
2929 loop_nest.release (); | |
2930 free_data_refs (datarefs_vec); | |
2931 delete ddrs_table; | |
2932 return 0; | |
2933 } | |
2934 | |
2935 data_reference_p dref; | |
2936 for (i = 0; datarefs_vec.iterate (i, &dref); ++i) | |
2937 dref->aux = (void *) (uintptr_t) i; | |
0 | 2938 |
2939 if (dump_file && (dump_flags & TDF_DETAILS)) | |
2940 dump_rdg (dump_file, rdg); | |
2941 | |
111 | 2942 auto_vec<struct partition *, 3> partitions; |
2943 rdg_build_partitions (rdg, stmts, &partitions); | |
2944 | |
2945 auto_vec<ddr_p> alias_ddrs; | |
2946 | |
2947 auto_bitmap stmt_in_all_partitions; | |
2948 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts); | |
2949 for (i = 1; partitions.iterate (i, &partition); ++i) | |
2950 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts); | |
2951 | |
145 | 2952 bool any_builtin = false; |
2953 bool reduction_in_all = false; | |
111 | 2954 FOR_EACH_VEC_ELT (partitions, i, partition) |
2955 { | |
145 | 2956 reduction_in_all |
2957 |= classify_partition (loop, rdg, partition, stmt_in_all_partitions); | |
111 | 2958 any_builtin |= partition_builtin_p (partition); |
2959 } | |
2960 | |
2961 /* If we are only distributing patterns but did not detect any, | |
2962 simply bail out. */ | |
145 | 2963 if (only_patterns_p |
111 | 2964 && !any_builtin) |
2965 { | |
2966 nbp = 0; | |
2967 goto ldist_done; | |
2968 } | |
2969 | |
2970 /* If we are only distributing patterns fuse all partitions that | |
2971 were not classified as builtins. This also avoids chopping | |
2972 a loop into pieces, separated by builtin calls. That is, we | |
2973 only want no or a single loop body remaining. */ | |
2974 struct partition *into; | |
145 | 2975 if (only_patterns_p) |
0 | 2976 { |
111 | 2977 for (i = 0; partitions.iterate (i, &into); ++i) |
2978 if (!partition_builtin_p (into)) | |
2979 break; | |
2980 for (++i; partitions.iterate (i, &partition); ++i) | |
2981 if (!partition_builtin_p (partition)) | |
2982 { | |
2983 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN); | |
2984 partitions.unordered_remove (i); | |
2985 partition_free (partition); | |
2986 i--; | |
2987 } | |
2988 } | |
2989 | |
2990 /* Due to limitations in the transform phase we have to fuse all | |
2991 reduction partitions into the last partition so the existing | |
2992 loop will contain all loop-closed PHI nodes. */ | |
2993 for (i = 0; partitions.iterate (i, &into); ++i) | |
2994 if (partition_reduction_p (into)) | |
2995 break; | |
2996 for (i = i + 1; partitions.iterate (i, &partition); ++i) | |
2997 if (partition_reduction_p (partition)) | |
2998 { | |
2999 partition_merge_into (rdg, into, partition, FUSE_REDUCTION); | |
3000 partitions.unordered_remove (i); | |
3001 partition_free (partition); | |
3002 i--; | |
3003 } | |
3004 | |
3005 /* Apply our simple cost model - fuse partitions with similar | |
3006 memory accesses. */ | |
3007 for (i = 0; partitions.iterate (i, &into); ++i) | |
3008 { | |
3009 bool changed = false; | |
131 | 3010 if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET) |
111 | 3011 continue; |
3012 for (int j = i + 1; | |
3013 partitions.iterate (j, &partition); ++j) | |
0 | 3014 { |
111 | 3015 if (share_memory_accesses (rdg, into, partition)) |
3016 { | |
3017 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF); | |
3018 partitions.unordered_remove (j); | |
3019 partition_free (partition); | |
3020 j--; | |
3021 changed = true; | |
3022 } | |
3023 } | |
3024 /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar | |
3025 accesses when 1 and 2 have similar accesses but not 0 and 1 | |
3026 then in the next iteration we will fail to consider merging | |
3027 1 into 0,2. So try again if we did any merging into 0. */ | |
3028 if (changed) | |
3029 i--; | |
3030 } | |
3031 | |
145 | 3032 /* Put a non-builtin partition last if we need to preserve a reduction. |
3033 ??? This is a workaround that makes sort_partitions_by_post_order do | |
3034 the correct thing while in reality it should sort each component | |
3035 separately and then put the component with a reduction or a non-builtin | |
3036 last. */ | |
3037 if (reduction_in_all | |
3038 && partition_builtin_p (partitions.last())) | |
3039 FOR_EACH_VEC_ELT (partitions, i, partition) | |
3040 if (!partition_builtin_p (partition)) | |
3041 { | |
3042 partitions.unordered_remove (i); | |
3043 partitions.quick_push (partition); | |
3044 break; | |
3045 } | |
3046 | |
111 | 3047 /* Build the partition dependency graph and fuse partitions in strong |
3048 connected component. */ | |
3049 if (partitions.length () > 1) | |
3050 { | |
3051 /* Don't support loop nest distribution under runtime alias check | |
145 | 3052 since it's not likely to enable many vectorization opportunities. |
3053 Also if loop has any data reference which may be not addressable | |
3054 since alias check needs to take, compare address of the object. */ | |
3055 if (loop->inner || has_nonaddressable_dataref_p) | |
111 | 3056 merge_dep_scc_partitions (rdg, &partitions, false); |
3057 else | |
3058 { | |
3059 merge_dep_scc_partitions (rdg, &partitions, true); | |
3060 if (partitions.length () > 1) | |
3061 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs); | |
0 | 3062 } |
3063 } | |
3064 | |
111 | 3065 finalize_partitions (loop, &partitions, &alias_ddrs); |
3066 | |
145 | 3067 /* If there is a reduction in all partitions make sure the last one |
3068 is not classified for builtin code generation. */ | |
3069 if (reduction_in_all) | |
3070 { | |
3071 partition = partitions.last (); | |
3072 if (only_patterns_p | |
3073 && partition_builtin_p (partition) | |
3074 && !partition_builtin_p (partitions[0])) | |
3075 { | |
3076 nbp = 0; | |
3077 goto ldist_done; | |
3078 } | |
3079 partition->kind = PKIND_NORMAL; | |
3080 } | |
3081 | |
111 | 3082 nbp = partitions.length (); |
3083 if (nbp == 0 | |
3084 || (nbp == 1 && !partition_builtin_p (partitions[0])) | |
3085 || (nbp > 1 && partition_contains_all_rw (rdg, partitions))) | |
3086 { | |
3087 nbp = 0; | |
3088 goto ldist_done; | |
3089 } | |
3090 | |
3091 if (version_for_distribution_p (&partitions, &alias_ddrs)) | |
131 | 3092 version_loop_by_alias_check (&partitions, loop, &alias_ddrs); |
111 | 3093 |
3094 if (dump_file && (dump_flags & TDF_DETAILS)) | |
3095 { | |
3096 fprintf (dump_file, | |
3097 "distribute loop <%d> into partitions:\n", loop->num); | |
3098 dump_rdg_partitions (dump_file, partitions); | |
3099 } | |
3100 | |
3101 FOR_EACH_VEC_ELT (partitions, i, partition) | |
3102 { | |
3103 if (partition_builtin_p (partition)) | |
3104 (*nb_calls)++; | |
3105 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1); | |
3106 } | |
3107 | |
3108 ldist_done: | |
3109 loop_nest.release (); | |
3110 free_data_refs (datarefs_vec); | |
3111 for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin (); | |
3112 iter != ddrs_table->end (); ++iter) | |
3113 { | |
3114 free_dependence_relation (*iter); | |
3115 *iter = NULL; | |
3116 } | |
3117 delete ddrs_table; | |
3118 | |
3119 FOR_EACH_VEC_ELT (partitions, i, partition) | |
3120 partition_free (partition); | |
3121 | |
0 | 3122 free_rdg (rdg); |
111 | 3123 return nbp - *nb_calls; |
0 | 3124 } |
3125 | |
145 | 3126 |
3127 void loop_distribution::bb_top_order_init (void) | |
3128 { | |
3129 int rpo_num; | |
3130 int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun)); | |
3131 | |
3132 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun)); | |
3133 bb_top_order_index_size = last_basic_block_for_fn (cfun); | |
3134 rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true); | |
3135 for (int i = 0; i < rpo_num; i++) | |
3136 bb_top_order_index[rpo[i]] = i; | |
3137 | |
3138 free (rpo); | |
3139 } | |
3140 | |
3141 void loop_distribution::bb_top_order_destroy () | |
3142 { | |
3143 free (bb_top_order_index); | |
3144 bb_top_order_index = NULL; | |
3145 bb_top_order_index_size = 0; | |
3146 } | |
3147 | |
3148 | |
3149 /* Given LOOP, this function records seed statements for distribution in | |
3150 WORK_LIST. Return false if there is nothing for distribution. */ | |
3151 | |
3152 static bool | |
3153 find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list) | |
3154 { | |
3155 basic_block *bbs = get_loop_body_in_dom_order (loop); | |
3156 | |
3157 /* Initialize the worklist with stmts we seed the partitions with. */ | |
3158 for (unsigned i = 0; i < loop->num_nodes; ++i) | |
3159 { | |
3160 for (gphi_iterator gsi = gsi_start_phis (bbs[i]); | |
3161 !gsi_end_p (gsi); gsi_next (&gsi)) | |
3162 { | |
3163 gphi *phi = gsi.phi (); | |
3164 if (virtual_operand_p (gimple_phi_result (phi))) | |
3165 continue; | |
3166 /* Distribute stmts which have defs that are used outside of | |
3167 the loop. */ | |
3168 if (!stmt_has_scalar_dependences_outside_loop (loop, phi)) | |
3169 continue; | |
3170 work_list->safe_push (phi); | |
3171 } | |
3172 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); | |
3173 !gsi_end_p (gsi); gsi_next (&gsi)) | |
3174 { | |
3175 gimple *stmt = gsi_stmt (gsi); | |
3176 | |
3177 /* Ignore clobbers, they do not have true side effects. */ | |
3178 if (gimple_clobber_p (stmt)) | |
3179 continue; | |
3180 | |
3181 /* If there is a stmt with side-effects bail out - we | |
3182 cannot and should not distribute this loop. */ | |
3183 if (gimple_has_side_effects (stmt)) | |
3184 { | |
3185 free (bbs); | |
3186 return false; | |
3187 } | |
3188 | |
3189 /* Distribute stmts which have defs that are used outside of | |
3190 the loop. */ | |
3191 if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) | |
3192 ; | |
3193 /* Otherwise only distribute stores for now. */ | |
3194 else if (!gimple_vdef (stmt)) | |
3195 continue; | |
3196 | |
3197 work_list->safe_push (stmt); | |
3198 } | |
3199 } | |
3200 free (bbs); | |
3201 return work_list->length () > 0; | |
3202 } | |
3203 | |
3204 /* Given innermost LOOP, return the outermost enclosing loop that forms a | |
3205 perfect loop nest. */ | |
3206 | |
3207 static class loop * | |
3208 prepare_perfect_loop_nest (class loop *loop) | |
3209 { | |
3210 class loop *outer = loop_outer (loop); | |
3211 tree niters = number_of_latch_executions (loop); | |
3212 | |
3213 /* TODO: We only support the innermost 3-level loop nest distribution | |
3214 because of compilation time issue for now. This should be relaxed | |
3215 in the future. Note we only allow 3-level loop nest distribution | |
3216 when parallelizing loops. */ | |
3217 while ((loop->inner == NULL | |
3218 || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1)) | |
3219 && loop_outer (outer) | |
3220 && outer->inner == loop && loop->next == NULL | |
3221 && single_exit (outer) | |
3222 && !chrec_contains_symbols_defined_in_loop (niters, outer->num) | |
3223 && (niters = number_of_latch_executions (outer)) != NULL_TREE | |
3224 && niters != chrec_dont_know) | |
3225 { | |
3226 loop = outer; | |
3227 outer = loop_outer (loop); | |
3228 } | |
3229 | |
3230 return loop; | |
3231 } | |
3232 | |
3233 | |
3234 unsigned int | |
3235 loop_distribution::execute (function *fun) | |
3236 { | |
3237 class loop *loop; | |
3238 bool changed = false; | |
3239 basic_block bb; | |
3240 control_dependences *cd = NULL; | |
3241 auto_vec<loop_p> loops_to_be_destroyed; | |
3242 | |
3243 if (number_of_loops (fun) <= 1) | |
3244 return 0; | |
3245 | |
3246 bb_top_order_init (); | |
3247 | |
3248 FOR_ALL_BB_FN (bb, fun) | |
3249 { | |
3250 gimple_stmt_iterator gsi; | |
3251 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) | |
3252 gimple_set_uid (gsi_stmt (gsi), -1); | |
3253 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) | |
3254 gimple_set_uid (gsi_stmt (gsi), -1); | |
3255 } | |
3256 | |
3257 /* We can at the moment only distribute non-nested loops, thus restrict | |
3258 walking to innermost loops. */ | |
3259 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) | |
3260 { | |
3261 /* Don't distribute multiple exit edges loop, or cold loop when | |
3262 not doing pattern detection. */ | |
3263 if (!single_exit (loop) | |
3264 || (!flag_tree_loop_distribute_patterns | |
3265 && !optimize_loop_for_speed_p (loop))) | |
3266 continue; | |
3267 | |
3268 /* Don't distribute loop if niters is unknown. */ | |
3269 tree niters = number_of_latch_executions (loop); | |
3270 if (niters == NULL_TREE || niters == chrec_dont_know) | |
3271 continue; | |
3272 | |
3273 /* Get the perfect loop nest for distribution. */ | |
3274 loop = prepare_perfect_loop_nest (loop); | |
3275 for (; loop; loop = loop->inner) | |
3276 { | |
3277 auto_vec<gimple *> work_list; | |
3278 if (!find_seed_stmts_for_distribution (loop, &work_list)) | |
3279 break; | |
3280 | |
3281 const char *str = loop->inner ? " nest" : ""; | |
3282 dump_user_location_t loc = find_loop_location (loop); | |
3283 if (!cd) | |
3284 { | |
3285 calculate_dominance_info (CDI_DOMINATORS); | |
3286 calculate_dominance_info (CDI_POST_DOMINATORS); | |
3287 cd = new control_dependences (); | |
3288 free_dominance_info (CDI_POST_DOMINATORS); | |
3289 } | |
3290 | |
3291 bool destroy_p; | |
3292 int nb_generated_loops, nb_generated_calls; | |
3293 nb_generated_loops | |
3294 = distribute_loop (loop, work_list, cd, &nb_generated_calls, | |
3295 &destroy_p, (!optimize_loop_for_speed_p (loop) | |
3296 || !flag_tree_loop_distribution)); | |
3297 if (destroy_p) | |
3298 loops_to_be_destroyed.safe_push (loop); | |
3299 | |
3300 if (nb_generated_loops + nb_generated_calls > 0) | |
3301 { | |
3302 changed = true; | |
3303 if (dump_enabled_p ()) | |
3304 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, | |
3305 loc, "Loop%s %d distributed: split to %d loops " | |
3306 "and %d library calls.\n", str, loop->num, | |
3307 nb_generated_loops, nb_generated_calls); | |
3308 | |
3309 break; | |
3310 } | |
3311 | |
3312 if (dump_file && (dump_flags & TDF_DETAILS)) | |
3313 fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num); | |
3314 } | |
3315 } | |
3316 | |
3317 if (cd) | |
3318 delete cd; | |
3319 | |
3320 if (bb_top_order_index != NULL) | |
3321 bb_top_order_destroy (); | |
3322 | |
3323 if (changed) | |
3324 { | |
3325 /* Destroy loop bodies that could not be reused. Do this late as we | |
3326 otherwise can end up refering to stale data in control dependences. */ | |
3327 unsigned i; | |
3328 FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop) | |
3329 destroy_loop (loop); | |
3330 | |
3331 /* Cached scalar evolutions now may refer to wrong or non-existing | |
3332 loops. */ | |
3333 scev_reset_htab (); | |
3334 mark_virtual_operands_for_renaming (fun); | |
3335 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); | |
3336 } | |
3337 | |
3338 checking_verify_loop_structure (); | |
3339 | |
3340 return changed ? TODO_cleanup_cfg : 0; | |
3341 } | |
3342 | |
3343 | |
0 | 3344 /* Distribute all loops in the current function. */ |
3345 | |
111 | 3346 namespace { |
3347 | |
3348 const pass_data pass_data_loop_distribution = | |
3349 { | |
3350 GIMPLE_PASS, /* type */ | |
3351 "ldist", /* name */ | |
3352 OPTGROUP_LOOP, /* optinfo_flags */ | |
3353 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */ | |
3354 ( PROP_cfg | PROP_ssa ), /* properties_required */ | |
3355 0, /* properties_provided */ | |
3356 0, /* properties_destroyed */ | |
3357 0, /* todo_flags_start */ | |
3358 0, /* todo_flags_finish */ | |
3359 }; | |
3360 | |
3361 class pass_loop_distribution : public gimple_opt_pass | |
3362 { | |
3363 public: | |
3364 pass_loop_distribution (gcc::context *ctxt) | |
3365 : gimple_opt_pass (pass_data_loop_distribution, ctxt) | |
3366 {} | |
3367 | |
3368 /* opt_pass methods: */ | |
3369 virtual bool gate (function *) | |
3370 { | |
3371 return flag_tree_loop_distribution | |
3372 || flag_tree_loop_distribute_patterns; | |
3373 } | |
3374 | |
3375 virtual unsigned int execute (function *); | |
3376 | |
3377 }; // class pass_loop_distribution | |
3378 | |
3379 unsigned int | |
3380 pass_loop_distribution::execute (function *fun) | |
0 | 3381 { |
145 | 3382 return loop_distribution ().execute (fun); |
0 | 3383 } |
3384 | |
111 | 3385 } // anon namespace |
3386 | |
3387 gimple_opt_pass * | |
3388 make_pass_loop_distribution (gcc::context *ctxt) | |
0 | 3389 { |
111 | 3390 return new pass_loop_distribution (ctxt); |
3391 } |