Mercurial > hg > CbC > CbC_gcc
comparison gcc/gimple-loop-jam.c @ 131:84e7813d76e9
gcc-8.2
author | mir3636 |
---|---|
date | Thu, 25 Oct 2018 07:37:49 +0900 |
parents | |
children | 1830386684a0 |
comparison
equal
deleted
inserted
replaced
111:04ced10e8804 | 131:84e7813d76e9 |
---|---|
1 /* Loop unroll-and-jam. | |
2 Copyright (C) 2017-2018 Free Software Foundation, Inc. | |
3 | |
4 This file is part of GCC. | |
5 | |
6 GCC is free software; you can redistribute it and/or modify it | |
7 under the terms of the GNU General Public License as published by the | |
8 Free Software Foundation; either version 3, or (at your option) any | |
9 later version. | |
10 | |
11 GCC is distributed in the hope that it will be useful, but WITHOUT | |
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 for more details. | |
15 | |
16 You should have received a copy of the GNU General Public License | |
17 along with GCC; see the file COPYING3. If not see | |
18 <http://www.gnu.org/licenses/>. */ | |
19 | |
20 #include "config.h" | |
21 #include "system.h" | |
22 #include "coretypes.h" | |
23 #include "params.h" | |
24 #include "tree-pass.h" | |
25 #include "backend.h" | |
26 #include "tree.h" | |
27 #include "gimple.h" | |
28 #include "ssa.h" | |
29 #include "fold-const.h" | |
30 #include "tree-cfg.h" | |
31 #include "tree-ssa.h" | |
32 #include "tree-ssa-loop-niter.h" | |
33 #include "tree-ssa-loop.h" | |
34 #include "tree-ssa-loop-manip.h" | |
35 #include "cfgloop.h" | |
36 #include "tree-scalar-evolution.h" | |
37 #include "gimple-iterator.h" | |
38 #include "cfghooks.h" | |
39 #include "tree-data-ref.h" | |
40 #include "tree-ssa-loop-ivopts.h" | |
41 #include "tree-vectorizer.h" | |
42 | |
43 /* Unroll and Jam transformation | |
44 | |
45 This is a combination of two transformations, where the second | |
46 is not always valid. It's applicable if a loop nest has redundancies | |
47 over the iterations of an outer loop while not having that with | |
48 an inner loop. | |
49 | |
50 Given this nest: | |
51 for (i) { | |
52 for (j) { | |
53 B(i,j) | |
54 } | |
55 } | |
56 | |
57 first unroll: | |
58 for (i by 2) { | |
59 for (j) { | |
60 B(i,j) | |
61 } | |
62 for (j) { | |
63 B(i+1,j) | |
64 } | |
65 } | |
66 | |
67 then fuse the two adjacent inner loops resulting from that: | |
68 for (i by 2) { | |
69 for (j) { | |
70 B(i,j) | |
71 B(i+1,j) | |
72 } | |
73 } | |
74 | |
75 As the order of evaluations of the body B changes this is valid | |
76 only in certain situations: all distance vectors need to be forward. | |
77 Additionally if there are multiple induction variables than just | |
78 a counting control IV (j above) we can also deal with some situations. | |
79 | |
80 The validity is checked by unroll_jam_possible_p, and the data-dep | |
81 testing below. | |
82 | |
83 A trivial example where the fusion is wrong would be when | |
84 B(i,j) == x[j-1] = x[j]; | |
85 for (i by 2) { | |
86 for (j) { | |
87 x[j-1] = x[j]; | |
88 } | |
89 for (j) { | |
90 x[j-1] = x[j]; | |
91 } | |
92 } effect: move content to front by two elements | |
93 --> | |
94 for (i by 2) { | |
95 for (j) { | |
96 x[j-1] = x[j]; | |
97 x[j-1] = x[j]; | |
98 } | |
99 } effect: move content to front by one element | |
100 */ | |
101 | |
102 /* Modify the loop tree for the fact that all code once belonging | |
103 to the OLD loop or the outer loop of OLD now is inside LOOP. */ | |
104 | |
105 static void | |
106 merge_loop_tree (struct loop *loop, struct loop *old) | |
107 { | |
108 basic_block *bbs; | |
109 int i, n; | |
110 struct loop *subloop; | |
111 edge e; | |
112 edge_iterator ei; | |
113 | |
114 /* Find its nodes. */ | |
115 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); | |
116 n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun)); | |
117 | |
118 for (i = 0; i < n; i++) | |
119 { | |
120 /* If the block was direct child of OLD loop it's now part | |
121 of LOOP. If it was outside OLD, then it moved into LOOP | |
122 as well. This avoids changing the loop father for BBs | |
123 in inner loops of OLD. */ | |
124 if (bbs[i]->loop_father == old | |
125 || loop_depth (bbs[i]->loop_father) < loop_depth (old)) | |
126 { | |
127 remove_bb_from_loops (bbs[i]); | |
128 add_bb_to_loop (bbs[i], loop); | |
129 continue; | |
130 } | |
131 | |
132 /* If we find a direct subloop of OLD, move it to LOOP. */ | |
133 subloop = bbs[i]->loop_father; | |
134 if (loop_outer (subloop) == old && subloop->header == bbs[i]) | |
135 { | |
136 flow_loop_tree_node_remove (subloop); | |
137 flow_loop_tree_node_add (loop, subloop); | |
138 } | |
139 } | |
140 | |
141 /* Update the information about loop exit edges. */ | |
142 for (i = 0; i < n; i++) | |
143 { | |
144 FOR_EACH_EDGE (e, ei, bbs[i]->succs) | |
145 { | |
146 rescan_loop_exit (e, false, false); | |
147 } | |
148 } | |
149 | |
150 loop->num_nodes = n; | |
151 | |
152 free (bbs); | |
153 } | |
154 | |
155 /* BB is part of the outer loop of an unroll-and-jam situation. | |
156 Check if any statements therein would prevent the transformation. */ | |
157 | |
158 static bool | |
159 bb_prevents_fusion_p (basic_block bb) | |
160 { | |
161 gimple_stmt_iterator gsi; | |
162 /* BB is duplicated by outer unrolling and then all N-1 first copies | |
163 move into the body of the fused inner loop. If BB exits the outer loop | |
164 the last copy still does so, and the first N-1 copies are cancelled | |
165 by loop unrolling, so also after fusion it's the exit block. | |
166 But there might be other reasons that prevent fusion: | |
167 * stores or unknown side-effects prevent fusion | |
168 * loads don't | |
169 * computations into SSA names: these aren't problematic. Their | |
170 result will be unused on the exit edges of the first N-1 copies | |
171 (those aren't taken after unrolling). If they are used on the | |
172 other edge (the one leading to the outer latch block) they are | |
173 loop-carried (on the outer loop) and the Nth copy of BB will | |
174 compute them again (i.e. the first N-1 copies will be dead). */ | |
175 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) | |
176 { | |
177 gimple *g = gsi_stmt (gsi); | |
178 if (gimple_vdef (g) || gimple_has_side_effects (g)) | |
179 return true; | |
180 } | |
181 return false; | |
182 } | |
183 | |
/* Given an inner loop LOOP (of some OUTER loop) determine if
   we can safely fuse copies of it (generated by outer unrolling).
   If so return true, otherwise return false.  */

static bool
unroll_jam_possible_p (struct loop *outer, struct loop *loop)
{
  basic_block *bbs;
  int i, n;
  struct tree_niter_desc niter;

  /* When fusing the loops we skip the latch block
     of the first one, so it mustn't have any effects to
     preserve.  */
  if (!empty_block_p (loop->latch))
    return false;

  /* A unique exit is required so the exit PHIs and the niter
     analysis below have a single edge to work with.  */
  if (!single_exit (loop))
    return false;

  /* We need a perfect nest.  Quick check for adjacent inner loops.  */
  if (outer->inner != loop || loop->next)
    return false;

  /* Prevent head-controlled inner loops, that we usually have.
     The guard block would need to be accepted
     (invariant condition either entering or skipping the loop),
     without also accepting arbitrary control flow.  When unswitching
     ran before us (as with -O3) this won't be a problem because its
     outer loop unswitching will have moved out the invariant condition.

     If we do that we need to extend fuse_loops() to cope with this
     by threading through the (still invariant) copied condition
     between the two loop copies.  */
  if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
    return false;

  /* The number of iterations of the inner loop must be loop invariant
     with respect to the outer loop.  Also reject loops whose niter
     expression can't be analyzed or that might iterate zero times.  */
  if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
				  false, true)
      || niter.cmp == ERROR_MARK
      || !integer_zerop (niter.may_be_zero)
      || !expr_invariant_in_loop_p (outer, niter.niter))
    return false;

  /* If the inner loop produces any values that are used inside the
     outer loop (except the virtual op) then it can flow
     back (perhaps indirectly) into the inner loop.  This prevents
     fusion: without fusion the value at the last iteration is used,
     with fusion the value after the initial iteration is used.

     If all uses are outside the outer loop this doesn't prevent fusion;
     the value of the last iteration is still used (and the values from
     all intermediate iterations are dead).  */
  gphi_iterator psi;
  for (psi = gsi_start_phis (single_exit (loop)->dest);
       !gsi_end_p (psi); gsi_next (&psi))
    {
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      tree op = gimple_phi_result (psi.phi ());
      if (virtual_operand_p (op))
	continue;
      /* Walk all (non-debug) uses of the exit PHI result; any use
	 inside the outer loop kills the transformation.  */
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (!is_gimple_debug (use_stmt)
	      && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
	    return false;
	}
    }

  /* And check blocks belonging to just outer loop.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
      break;
  free (bbs);
  /* i != n means the scan above stopped at an offending block.  */
  if (i != n)
    return false;

  /* For now we can safely fuse copies of LOOP only if all
     loop carried variables are inductions (or the virtual op).

     We could handle reductions as well (the initial value in the second
     body would be the after-iter value of the first body) if it's over
     an associative and commutative operation.  We wouldn't
     be able to handle unknown cycles.  */
  for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
    {
      affine_iv iv;
      tree op = gimple_phi_result (psi.phi ());

      if (virtual_operand_p (op))
	continue;
      if (!simple_iv (loop, loop, op, &iv, true))
	return false;
      /* The inductions must be regular, loop invariant step and initial
         value.  */
      if (!expr_invariant_in_loop_p (outer, iv.step)
	  || !expr_invariant_in_loop_p (outer, iv.base))
	return false;
      /* XXX With more effort we could also be able to deal with inductions
         where the initial value is loop variant but a simple IV in the
         outer loop.  The initial value for the second body would be
         the original initial value plus iv.base.step.  The next value
         for the fused loop would be the original next value of the first
         copy, _not_ the next value of the second body.  */
    }

  return true;
}
299 | |
/* Fuse LOOP with all further neighbors.  The loops are expected to
   be in appropriate form (as checked by unroll_jam_possible_p):
   empty latches, matching header PHIs that are all inductions.  */

static void
fuse_loops (struct loop *loop)
{
  struct loop *next = loop->next;

  while (next)
    {
      edge e;

      /* Cut the branch into the first body's (empty) latch and remove
	 the latch block itself; control will instead continue into the
	 second body.  */
      remove_branch (single_pred_edge (loop->latch));
      /* Make delete_basic_block not fiddle with the loop structure.  */
      basic_block oldlatch = loop->latch;
      loop->latch = NULL;
      delete_basic_block (oldlatch);
      /* The second body's back edge now closes the fused loop.  */
      e = redirect_edge_and_branch (loop_latch_edge (next),
				    loop->header);
      loop->latch = e->src;
      flush_pending_stmts (e);

      gcc_assert (EDGE_COUNT (next->header->preds) == 1);

      /* The PHI nodes of the second body (single-argument now)
         need adjustments to use the right values: either directly
         the value of the corresponding PHI in the first copy or
         the one leaving the first body which unrolling did for us.

         See also unroll_jam_possible_p() for further possibilities.  */
      gphi_iterator psi_first, psi_second;
      e = single_pred_edge (next->header);
      /* Walk the two header PHI chains in lockstep; the assert after
	 the loop checks they had equal length.  */
      for (psi_first = gsi_start_phis (loop->header),
	   psi_second = gsi_start_phis (next->header);
	   !gsi_end_p (psi_first);
	   gsi_next (&psi_first), gsi_next (&psi_second))
	{
	  gphi *phi_first = psi_first.phi ();
	  gphi *phi_second = psi_second.phi ();
	  tree firstop = gimple_phi_result (phi_first);
	  /* The virtual operand is correct already as it's
	     always live at exit, hence has a LCSSA node and outer
	     loop unrolling updated SSA form.  */
	  if (virtual_operand_p (firstop))
	    continue;

	  /* Due to unroll_jam_possible_p() we know that this is
	     an induction.  The second body goes over the same
	     iteration space.  */
	  add_phi_arg (phi_second, firstop, e,
		       gimple_location (phi_first));
	}
      gcc_assert (gsi_end_p (psi_second));

      /* Absorb NEXT's blocks and subloops into LOOP, then dispose of
	 the now-empty loop structure.  */
      merge_loop_tree (loop, next);
      gcc_assert (!next->num_nodes);
      struct loop *ln = next->next;
      delete_loop (next);
      next = ln;
    }
  /* Fusion moved uses around; restore loop-closed SSA form for uses
     of values defined in LOOP.  */
  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
}
362 | |
/* Returns true if the distance in DDR can be determined and adjusts
   the unroll factor in *UNROLL to make unrolling valid for that distance.
   Otherwise return false.

   If this data dep can lead to a removed memory reference, increment
   *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
   for this to happen.  */

static bool
adjust_unroll_factor (struct data_dependence_relation *ddr,
		      unsigned *unroll, unsigned *profit_unroll,
		      unsigned *removed)
{
  bool ret = false;
  if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
    {
      /* No distance vectors means we couldn't analyze the dependence;
	 report failure so the caller can decide based on read/write.  */
      if (DDR_NUM_DIST_VECTS (ddr) == 0)
	return false;
      unsigned i;
      lambda_vector dist_v;
      FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
	{
	  /* A distance (a,b) is at worst transformed into (a/N,b) by the
	     unrolling (factor N), so the transformation is valid if
	     a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
	     factor needs to be limited so that the first condition holds.
	     That may limit the factor down to zero in the worst case.  */
	  int dist = dist_v[0];
	  /* Distance vectors are normalized with a non-negative leading
	     entry; anything else indicates an internal error.  */
	  if (dist < 0)
	    gcc_unreachable ();
	  else if ((unsigned)dist >= *unroll)
	    ;
	  else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
		   || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
		       && dist > 0))
	    ;
	  else
	    /* Clamp the unroll factor so a >= N holds (possibly to 0,
	       which disables the transformation in the caller).  */
	    *unroll = dist;

	  /* With a distance (a,0) it's always profitable to unroll-and-jam
	     (by a+1), because one memory reference will go away.  With
	     (a,b) and b != 0 that's less clear.  We will increase the
	     number of streams without lowering the number of mem refs.
	     So for now only handle the first situation.  */
	  if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
	    {
	      *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
	      (*removed)++;
	    }

	  ret = true;
	}
    }
  return ret;
}
418 | |
419 /* Main entry point for the unroll-and-jam transformation | |
420 described above. */ | |
421 | |
422 static unsigned int | |
423 tree_loop_unroll_and_jam (void) | |
424 { | |
425 struct loop *loop; | |
426 bool changed = false; | |
427 | |
428 gcc_assert (scev_initialized_p ()); | |
429 | |
430 /* Go through all innermost loops. */ | |
431 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) | |
432 { | |
433 struct loop *outer = loop_outer (loop); | |
434 | |
435 if (loop_depth (loop) < 2 | |
436 || optimize_loop_nest_for_size_p (outer)) | |
437 continue; | |
438 | |
439 if (!unroll_jam_possible_p (outer, loop)) | |
440 continue; | |
441 | |
442 vec<data_reference_p> datarefs; | |
443 vec<ddr_p> dependences; | |
444 unsigned unroll_factor, profit_unroll, removed; | |
445 struct tree_niter_desc desc; | |
446 bool unroll = false; | |
447 | |
448 auto_vec<loop_p, 3> loop_nest; | |
449 dependences.create (10); | |
450 datarefs.create (10); | |
451 if (!compute_data_dependences_for_loop (outer, true, &loop_nest, | |
452 &datarefs, &dependences)) | |
453 { | |
454 if (dump_file && (dump_flags & TDF_DETAILS)) | |
455 fprintf (dump_file, "Cannot analyze data dependencies\n"); | |
456 free_data_refs (datarefs); | |
457 free_dependence_relations (dependences); | |
458 return false; | |
459 } | |
460 if (!datarefs.length ()) | |
461 continue; | |
462 | |
463 if (dump_file && (dump_flags & TDF_DETAILS)) | |
464 dump_data_dependence_relations (dump_file, dependences); | |
465 | |
466 unroll_factor = (unsigned)-1; | |
467 profit_unroll = 1; | |
468 removed = 0; | |
469 | |
470 /* Check all dependencies. */ | |
471 unsigned i; | |
472 struct data_dependence_relation *ddr; | |
473 FOR_EACH_VEC_ELT (dependences, i, ddr) | |
474 { | |
475 struct data_reference *dra, *drb; | |
476 | |
477 /* If the refs are independend there's nothing to do. */ | |
478 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) | |
479 continue; | |
480 dra = DDR_A (ddr); | |
481 drb = DDR_B (ddr); | |
482 /* Nothing interesting for the self dependencies. */ | |
483 if (dra == drb) | |
484 continue; | |
485 | |
486 /* Now check the distance vector, for determining a sensible | |
487 outer unroll factor, and for validity of merging the inner | |
488 loop copies. */ | |
489 if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll, | |
490 &removed)) | |
491 { | |
492 /* Couldn't get the distance vector. For two reads that's | |
493 harmless (we assume we should unroll). For at least | |
494 one write this means we can't check the dependence direction | |
495 and hence can't determine safety. */ | |
496 | |
497 if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb)) | |
498 { | |
499 unroll_factor = 0; | |
500 break; | |
501 } | |
502 } | |
503 } | |
504 | |
505 /* We regard a user-specified minimum percentage of zero as a request | |
506 to ignore all profitability concerns and apply the transformation | |
507 always. */ | |
508 if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) | |
509 profit_unroll = 2; | |
510 else if (removed * 100 / datarefs.length () | |
511 < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) | |
512 profit_unroll = 1; | |
513 if (unroll_factor > profit_unroll) | |
514 unroll_factor = profit_unroll; | |
515 if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL)) | |
516 unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL); | |
517 unroll = (unroll_factor > 1 | |
518 && can_unroll_loop_p (outer, unroll_factor, &desc)); | |
519 | |
520 if (unroll) | |
521 { | |
522 if (dump_enabled_p ()) | |
523 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS, | |
524 find_loop_location (outer), | |
525 "applying unroll and jam with factor %d\n", | |
526 unroll_factor); | |
527 initialize_original_copy_tables (); | |
528 tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer), | |
529 &desc); | |
530 free_original_copy_tables (); | |
531 fuse_loops (outer->inner); | |
532 changed = true; | |
533 } | |
534 | |
535 loop_nest.release (); | |
536 free_dependence_relations (dependences); | |
537 free_data_refs (datarefs); | |
538 } | |
539 | |
540 if (changed) | |
541 { | |
542 scev_reset (); | |
543 free_dominance_info (CDI_DOMINATORS); | |
544 return TODO_cleanup_cfg; | |
545 } | |
546 return 0; | |
547 } | |
548 | |
/* Pass boilerplate: registers the unroll-and-jam transformation with
   the GIMPLE pass manager.  */

namespace {

const pass_data pass_data_loop_jam =
{
  GIMPLE_PASS, /* type */
  "unrolljam", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_JAM, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_jam : public gimple_opt_pass
{
public:
  pass_loop_jam (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_jam, ctxt)
  {}

  /* opt_pass methods: */
  /* The pass only runs when -funroll-and-jam is enabled.  */
  virtual bool gate (function *) { return flag_unroll_jam != 0; }
  virtual unsigned int execute (function *);

};

unsigned int
pass_loop_jam::execute (function *fun)
{
  /* NOTE(review): <= 1 presumably accounts for the function's root
     "loop", i.e. bail out when there are no real loops — confirm
     against number_of_loops()'s contract.  */
  if (number_of_loops (fun) <= 1)
    return 0;

  return tree_loop_unroll_and_jam ();
}

}
589 | |
/* Factory called by the pass manager: allocate a fresh instance of
   the unroll-and-jam pass for context CTXT.  */

gimple_opt_pass *
make_pass_loop_jam (gcc::context *ctxt)
{
  return new pass_loop_jam (ctxt);
}