comparison gcc/gimple-loop-jam.c @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
1 /* Loop unroll-and-jam.
2 Copyright (C) 2017-2018 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "params.h"
24 #include "tree-pass.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "ssa.h"
29 #include "fold-const.h"
30 #include "tree-cfg.h"
31 #include "tree-ssa.h"
32 #include "tree-ssa-loop-niter.h"
33 #include "tree-ssa-loop.h"
34 #include "tree-ssa-loop-manip.h"
35 #include "cfgloop.h"
36 #include "tree-scalar-evolution.h"
37 #include "gimple-iterator.h"
38 #include "cfghooks.h"
39 #include "tree-data-ref.h"
40 #include "tree-ssa-loop-ivopts.h"
41 #include "tree-vectorizer.h"
42
43 /* Unroll and Jam transformation
44
45 This is a combination of two transformations, where the second
46 is not always valid. It's applicable if a loop nest has redundancies
47 over the iterations of an outer loop that it does not have over
48 the iterations of an inner loop.
49
50 Given this nest:
51 for (i) {
52 for (j) {
53 B(i,j)
54 }
55 }
56
57 first unroll:
58 for (i by 2) {
59 for (j) {
60 B(i,j)
61 }
62 for (j) {
63 B(i+1,j)
64 }
65 }
66
67 then fuse the two adjacent inner loops resulting from that:
68 for (i by 2) {
69 for (j) {
70 B(i,j)
71 B(i+1,j)
72 }
73 }
74
75 As the order of evaluations of the body B changes this is valid
76 only in certain situations: all distance vectors need to be forward.
77 Additionally, if there are induction variables other than just
78 a counting control IV (j above), we can still handle some situations.
79
80 The validity is checked by unroll_jam_possible_p, and the data-dep
81 testing below.
82
83 A trivial example where the fusion is wrong would be when
84 B(i,j) == x[j-1] = x[j];
85 for (i by 2) {
86 for (j) {
87 x[j-1] = x[j];
88 }
89 for (j) {
90 x[j-1] = x[j];
91 }
92 } effect: move content to front by two elements
93 -->
94 for (i by 2) {
95 for (j) {
96 x[j-1] = x[j];
97 x[j-1] = x[j];
98 }
99 } effect: move content to front by one element
100 */
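
/* As a further illustration (the arrays here are invented for this
   sketch and do not appear anywhere in the pass), a nest where the
   fusion is valid and profitable:

     for (i) {
       for (j) {
         c[i] = c[i] + a[i][j] * b[j];
       }
     }

   unrolled by two and fused this becomes

     for (i by 2) {
       for (j) {
         c[i] = c[i] + a[i][j] * b[j];
         c[i+1] = c[i+1] + a[i+1][j] * b[j];
       }
     }

   and the load of b[j] (and the control overhead of the inner loop)
   is now shared by two former outer iterations.  */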
101
102 /* Modify the loop tree to reflect the fact that all code that once
103 belonged to the OLD loop or the outer loop of OLD is now inside LOOP. */
104
105 static void
106 merge_loop_tree (struct loop *loop, struct loop *old)
107 {
108 basic_block *bbs;
109 int i, n;
110 struct loop *subloop;
111 edge e;
112 edge_iterator ei;
113
114 /* Find its nodes. */
115 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
116 n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
117
118 for (i = 0; i < n; i++)
119 {
120 /* If the block was a direct child of the OLD loop it's now part
121 of LOOP. If it was outside OLD, then it moved into LOOP
122 as well. This avoids changing the loop father for BBs
123 in inner loops of OLD. */
124 if (bbs[i]->loop_father == old
125 || loop_depth (bbs[i]->loop_father) < loop_depth (old))
126 {
127 remove_bb_from_loops (bbs[i]);
128 add_bb_to_loop (bbs[i], loop);
129 continue;
130 }
131
132 /* If we find a direct subloop of OLD, move it to LOOP. */
133 subloop = bbs[i]->loop_father;
134 if (loop_outer (subloop) == old && subloop->header == bbs[i])
135 {
136 flow_loop_tree_node_remove (subloop);
137 flow_loop_tree_node_add (loop, subloop);
138 }
139 }
140
141 /* Update the information about loop exit edges. */
142 for (i = 0; i < n; i++)
143 {
144 FOR_EACH_EDGE (e, ei, bbs[i]->succs)
145 {
146 rescan_loop_exit (e, false, false);
147 }
148 }
149
150 loop->num_nodes = n;
151
152 free (bbs);
153 }
154
155 /* BB is part of the outer loop of an unroll-and-jam situation.
156 Check if any statements therein would prevent the transformation. */
157
158 static bool
159 bb_prevents_fusion_p (basic_block bb)
160 {
161 gimple_stmt_iterator gsi;
162 /* BB is duplicated by outer unrolling and then the first N-1 copies
163 move into the body of the fused inner loop. If BB exits the outer loop
164 the last copy still does so, and the first N-1 copies are cancelled
165 by loop unrolling, so it remains the exit block after fusion as well.
166 But there might be other reasons that prevent fusion:
167 * stores or unknown side-effects prevent fusion
168 * loads don't
169 * computations into SSA names: these aren't problematic. Their
170 result will be unused on the exit edges of the first N-1 copies
171 (those aren't taken after unrolling). If they are used on the
172 other edge (the one leading to the outer latch block) they are
173 loop-carried (on the outer loop) and the Nth copy of BB will
174 compute them again (i.e. the first N-1 copies will be dead). */
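/* For instance (purely illustrative), in

     for (i) {
       for (j) {
         B(i,j)
       }
       a[i] = 0;
     }

   the block containing the store to a[i] belongs to just the outer
   loop; moving its first N-1 copies into the fused inner loop body
   would make the store execute once per inner iteration instead of
   once per outer iteration, so such a block makes us give up.  */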
175 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
176 {
177 gimple *g = gsi_stmt (gsi);
178 if (gimple_vdef (g) || gimple_has_side_effects (g))
179 return true;
180 }
181 return false;
182 }
183
184 /* Given an inner loop LOOP (of some OUTER loop) determine if
185 we can safely fuse copies of it (generated by outer unrolling).
186 If so return true, otherwise return false. */
187
188 static bool
189 unroll_jam_possible_p (struct loop *outer, struct loop *loop)
190 {
191 basic_block *bbs;
192 int i, n;
193 struct tree_niter_desc niter;
194
195 /* When fusing the loops we skip the latch block
196 of the first one, so it mustn't have any effects to
197 preserve. */
198 if (!empty_block_p (loop->latch))
199 return false;
200
201 if (!single_exit (loop))
202 return false;
203
204 /* We need a perfect nest. Quick check for adjacent inner loops. */
205 if (outer->inner != loop || loop->next)
206 return false;
207
208 /* Prevent head-controlled inner loops, which we usually have.
209 The guard block would need to be accepted
210 (invariant condition either entering or skipping the loop),
211 without also accepting arbitrary control flow. When unswitching
212 ran before us (as with -O3) this won't be a problem because its
213 outer loop unswitching will have moved out the invariant condition.
214
215 If we do that we need to extend fuse_loops() to cope with this
216 by threading through the (still invariant) copied condition
217 between the two loop copies. */
218 if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
219 return false;
220
221 /* The number of iterations of the inner loop must be loop invariant
222 with respect to the outer loop. */
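/* E.g. a triangular nest like

     for (i) {
       for (j = 0; j < i; j++) {
         B(i,j)
       }
     }

   is rejected here: the inner iteration count depends on I, so the
   copies created by outer unrolling would not run over the same J
   range and could not be fused.  */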
223 if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
224 false, true)
225 || niter.cmp == ERROR_MARK
226 || !integer_zerop (niter.may_be_zero)
227 || !expr_invariant_in_loop_p (outer, niter.niter))
228 return false;
229
230 /* If the inner loop produces any values that are used inside the
231 outer loop (except the virtual op) then it can flow
232 back (perhaps indirectly) into the inner loop. This prevents
233 fusion: without fusion the value at the last iteration is used,
234 with fusion the value after the initial iteration is used.
235
236 If all uses are outside the outer loop this doesn't prevent fusion;
237 the value of the last iteration is still used (and the values from
238 all intermediate iterations are dead). */
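/* An example of the rejected shape (names invented for this sketch):

     for (i) {
       for (j) {
         s = a[i][j] * 2;
       }
       x[i] = s;
     }

   S is live at the inner exit (it has a non-virtual LCSSA PHI there)
   and its use sits inside the outer loop, so we give up; the store
   to x[i] would independently be refused by bb_prevents_fusion_p.  */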
239 gphi_iterator psi;
240 for (psi = gsi_start_phis (single_exit (loop)->dest);
241 !gsi_end_p (psi); gsi_next (&psi))
242 {
243 imm_use_iterator imm_iter;
244 use_operand_p use_p;
245 tree op = gimple_phi_result (psi.phi ());
246 if (virtual_operand_p (op))
247 continue;
248 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
249 {
250 gimple *use_stmt = USE_STMT (use_p);
251 if (!is_gimple_debug (use_stmt)
252 && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
253 return false;
254 }
255 }
256
257 /* And check the blocks belonging just to the outer loop. */
258 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
259 n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
260
261 for (i = 0; i < n; i++)
262 if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
263 break;
264 free (bbs);
265 if (i != n)
266 return false;
267
268 /* For now we can safely fuse copies of LOOP only if all
269 loop carried variables are inductions (or the virtual op).
270
271 We could handle reductions as well (the initial value in the second
272 body would be the after-iter value of the first body) if it's over
273 an associative and commutative operation. We wouldn't
274 be able to handle unknown cycles. */
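/* So a scalar reduction carried by the inner loop, e.g.

     for (i) {
       for (j) {
         sum = sum + a[i][j];
       }
     }

   is not handled yet: the header PHI of SUM is a cycle but not a
   simple affine induction, so the simple_iv() check below fails
   (and its exit value is used by the outer loop, which the LCSSA
   check above refuses as well).  */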
275 for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
276 {
277 affine_iv iv;
278 tree op = gimple_phi_result (psi.phi ());
279
280 if (virtual_operand_p (op))
281 continue;
282 if (!simple_iv (loop, loop, op, &iv, true))
283 return false;
284 /* The induction must be regular, with a loop-invariant step and
285 initial value. */
286 if (!expr_invariant_in_loop_p (outer, iv.step)
287 || !expr_invariant_in_loop_p (outer, iv.base))
288 return false;
289 /* XXX With more effort we could also be able to deal with inductions
290 where the initial value is loop variant but a simple IV in the
291 outer loop. The initial value for the second body would be
292 the original initial value plus iv.base.step. The next value
293 for the fused loop would be the original next value of the first
294 copy, _not_ the next value of the second body. */
295 }
296
297 return true;
298 }
299
300 /* Fuse LOOP with all further neighbors. The loops are expected to
301 be in appropriate form. */
302
303 static void
304 fuse_loops (struct loop *loop)
305 {
306 struct loop *next = loop->next;
307
308 while (next)
309 {
310 edge e;
311
312 remove_branch (single_pred_edge (loop->latch));
313 /* Make delete_basic_block not fiddle with the loop structure. */
314 basic_block oldlatch = loop->latch;
315 loop->latch = NULL;
316 delete_basic_block (oldlatch);
317 e = redirect_edge_and_branch (loop_latch_edge (next),
318 loop->header);
319 loop->latch = e->src;
320 flush_pending_stmts (e);
321
322 gcc_assert (EDGE_COUNT (next->header->preds) == 1);
323
324 /* The PHI nodes of the second body (single-argument now)
325 need adjustments to use the right values: either directly
326 the value of the corresponding PHI in the first copy or
327 the one leaving the first body which unrolling did for us.
328
329 See also unroll_jam_possible_p() for further possibilities. */
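/* Schematically (SSA names invented for this sketch): if the first
   body's header has

     j_1 = PHI <j_init(preheader), j_next(latch)>

   then the corresponding PHI of the second body receives J_1 on the
   edge E coming from the first body, so in every fused iteration the
   second body works on exactly the same J value as the first one.  */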
330 gphi_iterator psi_first, psi_second;
331 e = single_pred_edge (next->header);
332 for (psi_first = gsi_start_phis (loop->header),
333 psi_second = gsi_start_phis (next->header);
334 !gsi_end_p (psi_first);
335 gsi_next (&psi_first), gsi_next (&psi_second))
336 {
337 gphi *phi_first = psi_first.phi ();
338 gphi *phi_second = psi_second.phi ();
339 tree firstop = gimple_phi_result (phi_first);
340 /* The virtual operand is correct already as it's
341 always live at exit, hence has an LCSSA node, and outer
342 loop unrolling already updated SSA form for it. */
343 if (virtual_operand_p (firstop))
344 continue;
345
346 /* Due to unroll_jam_possible_p() we know that this is
347 an induction. The second body goes over the same
348 iteration space. */
349 add_phi_arg (phi_second, firstop, e,
350 gimple_location (phi_first));
351 }
352 gcc_assert (gsi_end_p (psi_second));
353
354 merge_loop_tree (loop, next);
355 gcc_assert (!next->num_nodes);
356 struct loop *ln = next->next;
357 delete_loop (next);
358 next = ln;
359 }
360 rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
361 }
362
363 /* Returns true if the distance in DDR can be determined and adjusts
364 the unroll factor in *UNROLL to make unrolling valid for that distance.
365 Otherwise return false.
366
367 If this data dep can lead to a removed memory reference, increment
368 *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
369 for this to happen. */
370
371 static bool
372 adjust_unroll_factor (struct data_dependence_relation *ddr,
373 unsigned *unroll, unsigned *profit_unroll,
374 unsigned *removed)
375 {
376 bool ret = false;
377 if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
378 {
379 if (DDR_NUM_DIST_VECTS (ddr) == 0)
380 return false;
381 unsigned i;
382 lambda_vector dist_v;
383 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
384 {
385 /* A distance (a,b) is at worst transformed into (a/N,b) by the
386 unrolling (factor N), so the transformation is valid if
387 a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll
388 factor needs to be limited so that the first condition holds.
389 That may limit the factor down to zero in the worst case. */
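/* A made-up example for the limiting case: if B(i,j) is
     x[i][j] = x[i-2][j+1];
   the distance vector is (2,-1); neither b > 0 nor b == 0 holds, so
   the unroll factor is capped at a == 2.  Unrolling by more would
   make a copy read x[.][j+1] before the fused loop has stored it.  */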
390 int dist = dist_v[0];
391 if (dist < 0)
392 gcc_unreachable ();
393 else if ((unsigned)dist >= *unroll)
394 ;
395 else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
396 || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
397 && dist > 0))
398 ;
399 else
400 *unroll = dist;
401
402 /* With a distance (a,0) it's always profitable to unroll-and-jam
403 (by a+1), because one memory reference will go away. With
404 (a,b) and b != 0 that's less clear. We will increase the
405 number of streams without lowering the number of mem refs.
406 So for now only handle the first situation. */
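/* For instance, with B(i,j) being
     y[i][j] = y[i-1][j] + x[i][j];
   the store y[i][j] and the load y[i-1][j] are distance (1,0) apart;
   after unroll-and-jam by 2 the second body's load of y[i][j] can
   reuse the value the first body just stored, so one memory
   reference per iteration can go away.  */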
407 if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
408 {
409 *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
410 (*removed)++;
411 }
412
413 ret = true;
414 }
415 }
416 return ret;
417 }
418
419 /* Main entry point for the unroll-and-jam transformation
420 described above. */
421
422 static unsigned int
423 tree_loop_unroll_and_jam (void)
424 {
425 struct loop *loop;
426 bool changed = false;
427
428 gcc_assert (scev_initialized_p ());
429
430 /* Go through all innermost loops. */
431 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
432 {
433 struct loop *outer = loop_outer (loop);
434
435 if (loop_depth (loop) < 2
436 || optimize_loop_nest_for_size_p (outer))
437 continue;
438
439 if (!unroll_jam_possible_p (outer, loop))
440 continue;
441
442 vec<data_reference_p> datarefs;
443 vec<ddr_p> dependences;
444 unsigned unroll_factor, profit_unroll, removed;
445 struct tree_niter_desc desc;
446 bool unroll = false;
447
448 auto_vec<loop_p, 3> loop_nest;
449 dependences.create (10);
450 datarefs.create (10);
451 if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
452 &datarefs, &dependences))
453 {
454 if (dump_file && (dump_flags & TDF_DETAILS))
455 fprintf (dump_file, "Cannot analyze data dependencies\n");
456 free_data_refs (datarefs);
457 free_dependence_relations (dependences);
458 return false;
459 }
460 if (!datarefs.length ())
461 continue;
462
463 if (dump_file && (dump_flags & TDF_DETAILS))
464 dump_data_dependence_relations (dump_file, dependences);
465
466 unroll_factor = (unsigned)-1;
467 profit_unroll = 1;
468 removed = 0;
469
470 /* Check all dependencies. */
471 unsigned i;
472 struct data_dependence_relation *ddr;
473 FOR_EACH_VEC_ELT (dependences, i, ddr)
474 {
475 struct data_reference *dra, *drb;
476
477 /* If the refs are independent there's nothing to do. */
478 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
479 continue;
480 dra = DDR_A (ddr);
481 drb = DDR_B (ddr);
482 /* Nothing interesting for the self dependencies. */
483 if (dra == drb)
484 continue;
485
486 /* Now check the distance vector, for determining a sensible
487 outer unroll factor, and for validity of merging the inner
488 loop copies. */
489 if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
490 &removed))
491 {
492 /* Couldn't get the distance vector. For two reads that's
493 harmless (we assume we should unroll). For at least
494 one write this means we can't check the dependence direction
495 and hence can't determine safety. */
496
497 if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
498 {
499 unroll_factor = 0;
500 break;
501 }
502 }
503 }
504
505 /* We regard a user-specified minimum percentage of zero as a request
506 to ignore all profitability concerns and always apply the
507 transformation. */
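/* These correspond to the --param unroll-jam-min-percent and
   --param unroll-jam-max-unroll knobs, so an invocation like
     gcc -O3 -floop-unroll-and-jam --param unroll-jam-min-percent=0
   (given only as an example) requests the transformation whenever it
   is valid, regardless of the estimated benefit.  */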
508 if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
509 profit_unroll = 2;
510 else if (removed * 100 / datarefs.length ()
511 < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
512 profit_unroll = 1;
513 if (unroll_factor > profit_unroll)
514 unroll_factor = profit_unroll;
515 if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
516 unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
517 unroll = (unroll_factor > 1
518 && can_unroll_loop_p (outer, unroll_factor, &desc));
519
520 if (unroll)
521 {
522 if (dump_enabled_p ())
523 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
524 find_loop_location (outer),
525 "applying unroll and jam with factor %d\n",
526 unroll_factor);
527 initialize_original_copy_tables ();
528 tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
529 &desc);
530 free_original_copy_tables ();
531 fuse_loops (outer->inner);
532 changed = true;
533 }
534
535 loop_nest.release ();
536 free_dependence_relations (dependences);
537 free_data_refs (datarefs);
538 }
539
540 if (changed)
541 {
542 scev_reset ();
543 free_dominance_info (CDI_DOMINATORS);
544 return TODO_cleanup_cfg;
545 }
546 return 0;
547 }
548
549 /* Pass boilerplate */
550
551 namespace {
552
553 const pass_data pass_data_loop_jam =
554 {
555 GIMPLE_PASS, /* type */
556 "unrolljam", /* name */
557 OPTGROUP_LOOP, /* optinfo_flags */
558 TV_LOOP_JAM, /* tv_id */
559 PROP_cfg, /* properties_required */
560 0, /* properties_provided */
561 0, /* properties_destroyed */
562 0, /* todo_flags_start */
563 0, /* todo_flags_finish */
564 };
565
566 class pass_loop_jam : public gimple_opt_pass
567 {
568 public:
569 pass_loop_jam (gcc::context *ctxt)
570 : gimple_opt_pass (pass_data_loop_jam, ctxt)
571 {}
572
573 /* opt_pass methods: */
574 virtual bool gate (function *) { return flag_unroll_jam != 0; }
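  /* flag_unroll_jam is the variable behind the -floop-unroll-and-jam
     option, so e.g. "gcc -O3 -floop-unroll-and-jam foo.c" is one way
     to make this gate return true (example invocation only).  */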
575 virtual unsigned int execute (function *);
576
577 };
578
579 unsigned int
580 pass_loop_jam::execute (function *fun)
581 {
582 if (number_of_loops (fun) <= 1)
583 return 0;
584
585 return tree_loop_unroll_and_jam ();
586 }
587
588 }
589
590 gimple_opt_pass *
591 make_pass_loop_jam (gcc::context *ctxt)
592 {
593 return new pass_loop_jam (ctxt);
594 }