145
|
1 /* Loop versioning pass.
|
|
2 Copyright (C) 2018-2020 Free Software Foundation, Inc.
|
|
3
|
|
4 This file is part of GCC.
|
|
5
|
|
6 GCC is free software; you can redistribute it and/or modify it
|
|
7 under the terms of the GNU General Public License as published by the
|
|
8 Free Software Foundation; either version 3, or (at your option) any
|
|
9 later version.
|
|
10
|
|
11 GCC is distributed in the hope that it will be useful, but WITHOUT
|
|
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
14 for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with GCC; see the file COPYING3. If not see
|
|
18 <http://www.gnu.org/licenses/>. */
|
|
19
|
|
20 #include "config.h"
|
|
21 #include "system.h"
|
|
22 #include "coretypes.h"
|
|
23 #include "backend.h"
|
|
24 #include "tree.h"
|
|
25 #include "gimple.h"
|
|
26 #include "gimple-iterator.h"
|
|
27 #include "tree-pass.h"
|
|
28 #include "gimplify-me.h"
|
|
29 #include "cfgloop.h"
|
|
30 #include "tree-ssa-loop.h"
|
|
31 #include "ssa.h"
|
|
32 #include "tree-scalar-evolution.h"
|
|
33 #include "tree-chrec.h"
|
|
34 #include "tree-ssa-loop-ivopts.h"
|
|
35 #include "fold-const.h"
|
|
36 #include "tree-ssa-propagate.h"
|
|
37 #include "tree-inline.h"
|
|
38 #include "domwalk.h"
|
|
39 #include "alloc-pool.h"
|
|
40 #include "vr-values.h"
|
|
41 #include "gimple-ssa-evrp-analyze.h"
|
|
42 #include "tree-vectorizer.h"
|
|
43 #include "omp-general.h"
|
|
44 #include "predict.h"
|
|
45 #include "tree-into-ssa.h"
|
|
46
|
|
47 namespace {
|
|
48
|
|
49 /* This pass looks for loops that could be simplified if certain loop
|
|
50 invariant conditions were true. It is effectively a form of loop
|
|
51 splitting in which the pass produces the split conditions itself,
|
|
52 instead of using ones that are already present in the IL.
|
|
53
|
|
54 Versioning for when strides are 1
|
|
55 ---------------------------------
|
|
56
|
|
57 At the moment the only thing the pass looks for are memory references
|
|
58 like:
|
|
59
|
|
60 for (auto i : ...)
|
|
61 ...x[i * stride]...
|
|
62
|
|
63 It considers changing such loops to:
|
|
64
|
|
65 if (stride == 1)
|
|
66 for (auto i : ...) [A]
|
|
67 ...x[i]...
|
|
68 else
|
|
69 for (auto i : ...) [B]
|
|
70 ...x[i * stride]...
|
|
71
|
|
72 This can have several benefits:
|
|
73
|
|
74 (1) [A] is often easier or cheaper to vectorize than [B].
|
|
75
|
|
76 (2) The scalar code in [A] is simpler than the scalar code in [B]
|
|
77 (if the loops cannot be vectorized or need an epilogue loop).
|
|
78
|
|
79 (3) We might recognize [A] as a pattern, such as a memcpy or memset.
|
|
80
|
|
81 (4) [A] has simpler address evolutions, which can help other passes
|
|
82 like loop interchange.
|
|
83
|
|
84 The optimization is particularly useful for assumed-shape arrays in
|
|
85 Fortran, where the stride of the innermost dimension depends on the
|
|
86 array descriptor but is often equal to 1 in practice. For example:
|
|
87
|
|
88 subroutine f1(x)
|
|
89 real :: x(:)
|
|
90 x(:) = 100
|
|
91 end subroutine f1
|
|
92
|
|
93 generates the equivalent of:
|
|
94
|
|
95 raw_stride = *x.dim[0].stride;
|
|
96 stride = raw_stride != 0 ? raw_stride : 1;
|
|
97 x_base = *x.data;
|
|
98 ...
|
|
99 tmp1 = stride * S;
|
|
100 tmp2 = tmp1 - stride;
|
|
101 *x_base[tmp2] = 1.0e+2;
|
|
102
|
|
103 but in the common case that stride == 1, the last three statements
|
|
104 simplify to:
|
|
105
|
|
106 tmp3 = S + -1;
|
|
107 *x_base[tmp3] = 1.0e+2;
|
|
108
|
|
109 The optimization is in principle very simple. The difficult parts are:
|
|
110
|
|
111 (a) deciding which parts of a general address calculation correspond
|
|
112 to the inner dimension of an array, since this usually isn't explicit
|
|
113 in the IL, and for C often isn't even explicit in the source code
|
|
114
|
|
115 (b) estimating when the transformation is worthwhile
|
|
116
|
|
117 Structure
|
|
118 ---------
|
|
119
|
|
120 The pass has four phases:
|
|
121
|
|
122 (1) Walk through the statements looking for and recording potential
|
|
123 versioning opportunities. Stop if there are none.
|
|
124
|
|
125 (2) Use context-sensitive range information to see whether any versioning
|
|
126 conditions are impossible in practice. Remove them if so, and stop
|
|
127 if no opportunities remain.
|
|
128
|
|
129 (We do this only after (1) to keep compile time down when no
|
|
130 versioning opportunities exist.)
|
|
131
|
|
132 (3) Apply the cost model. Decide which versioning opportunities are
|
|
133 worthwhile and at which nesting level they should be applied.
|
|
134
|
|
135 (4) Attempt to version all the loops selected by (3), so that:
|
|
136
|
|
137 for (...)
|
|
138 ...
|
|
139
|
|
140 becomes:
|
|
141
|
|
142 if (!cond)
|
|
143 for (...) // Original loop
|
|
144 ...
|
|
145 else
|
|
146 for (...) // New loop
|
|
147 ...
|
|
148
|
|
149 Use the version condition COND to simplify the new loop. */
|
|
150
|
|
/* Enumerates the likelihood that a particular value indexes the inner
   dimension of an array.  Values are ordered from least to most likely.  */
enum inner_likelihood {
  INNER_UNLIKELY,
  INNER_DONT_KNOW,
  INNER_LIKELY
};
|
|
158
|
|
/* Information about one term of an address_info.  */
struct address_term_info
{
  /* The value of the term is EXPR * MULTIPLIER.  */
  tree expr;
  unsigned HOST_WIDE_INT multiplier;

  /* The stride applied by EXPR in each iteration of some unrecorded loop,
     or null if no stride has been identified.  */
  tree stride;

  /* Enumerates the likelihood that EXPR indexes the inner dimension
     of an array.  */
  enum inner_likelihood inner_likelihood;

  /* True if STRIDE == 1 is a versioning opportunity when considered
     in isolation.  */
  bool versioning_opportunity_p;
};
|
|
178
|
|
/* Information about an address calculation, and the range of constant
   offsets applied to it.  */
class address_info
{
public:
  static const unsigned int MAX_TERMS = 8;

  /* One statement that calculates the address.  If multiple statements
     share the same address, we only record the first.  */
  gimple *stmt;

  /* The loop containing STMT (cached for convenience).  If multiple
     statements share the same address, they all belong to this loop.  */
  class loop *loop;

  /* A decomposition of the calculation into a sum of terms plus an
     optional base.  When BASE is provided, it is never an SSA name.
     Once initialization is complete, all members of TERMS are SSA names.  */
  tree base;
  auto_vec<address_term_info, MAX_TERMS> terms;

  /* All bytes accessed from the address fall in the offset range
     [MIN_OFFSET, MAX_OFFSET).  */
  HOST_WIDE_INT min_offset, max_offset;
};
|
|
204
|
|
/* Stores addresses based on their base and terms (ignoring the offsets),
   so that addresses that differ only in their constant offset range
   hash and compare equal.  */
struct address_info_hasher : nofree_ptr_hash <address_info>
{
  static hashval_t hash (const address_info *);
  static bool equal (const address_info *, const address_info *);
};
|
|
211
|
|
/* Information about the versioning we'd like to apply to a loop.  */
class loop_info
{
public:
  bool worth_versioning_p () const;

  /* True if we've decided not to version this loop.  The remaining
     fields are meaningless if so.  */
  bool rejected_p;

  /* True if at least one subloop of this loop benefits from versioning.  */
  bool subloops_benefit_p;

  /* An estimate of the total number of instructions in the loop,
     excluding those in subloops that benefit from versioning.  */
  unsigned int num_insns;

  /* The outermost loop that can handle all the version checks
     described below.  */
  class loop *outermost;

  /* The first entry in the list of blocks that belong to this loop
     (and not to subloops).  m_next_block_in_loop provides the chain
     pointers for the list.  */
  basic_block block_list;

  /* We'd like to version the loop for the case in which these SSA names
     (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime.  */
  bitmap_head unity_names;

  /* If versioning succeeds, this points to the version of the loop that
     assumes the version conditions hold.  */
  class loop *optimized_loop;
};
|
|
246
|
|
/* The main pass structure.  Collects versioning opportunities across
   the whole function, applies the cost model, and then performs the
   selected versionings.  */
class loop_versioning
{
public:
  loop_versioning (function *);
  ~loop_versioning ();
  unsigned int run ();

private:
  /* Used to walk the dominator tree to find loop versioning conditions
     that are always false.  */
  class lv_dom_walker : public dom_walker
  {
  public:
    lv_dom_walker (loop_versioning &);

    edge before_dom_children (basic_block) FINAL OVERRIDE;
    void after_dom_children (basic_block) FINAL OVERRIDE;

  private:
    /* The parent pass.  */
    loop_versioning &m_lv;

    /* Used to build context-dependent range information.  */
    evrp_range_analyzer m_range_analyzer;
  };

  /* Used to simplify statements based on conditions that are established
     by the version checks.  */
  class name_prop : public substitute_and_fold_engine
  {
  public:
    name_prop (loop_info &li) : m_li (li) {}
    tree get_value (tree) FINAL OVERRIDE;

  private:
    /* Information about the versioning we've performed on the loop.  */
    loop_info &m_li;
  };

  /* Return the loop_info record for LOOP, indexed by loop number.  */
  loop_info &get_loop_info (class loop *loop) { return m_loops[loop->num]; }

  unsigned int max_insns_for_loop (class loop *);
  bool expensive_stmt_p (gimple *);

  void version_for_unity (gimple *, tree);
  bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT,
				unsigned HOST_WIDE_INT * = 0);
  bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *);
  bool multiply_term_by (address_term_info &, tree);
  inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT);
  void dump_inner_likelihood (address_info &, address_term_info &);
  void analyze_stride (address_info &, address_term_info &,
		       tree, class loop *);
  bool find_per_loop_multiplication (address_info &, address_term_info &);
  bool analyze_term_using_scevs (address_info &, address_term_info &);
  void analyze_arbitrary_term (address_info &, address_term_info &);
  void analyze_address_fragment (address_info &);
  void record_address_fragment (gimple *, unsigned HOST_WIDE_INT,
				tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT);
  void analyze_expr (gimple *, tree);
  bool analyze_block (basic_block);
  bool analyze_blocks ();

  void prune_loop_conditions (class loop *, vr_values *);
  bool prune_conditions ();

  void merge_loop_info (class loop *, class loop *);
  void add_loop_to_queue (class loop *);
  bool decide_whether_loop_is_versionable (class loop *);
  bool make_versioning_decisions ();

  bool version_loop (class loop *);
  void implement_versioning_decisions ();

  /* The function we're optimizing.  */
  function *m_fn;

  /* The obstack to use for all pass-specific bitmaps.  */
  bitmap_obstack m_bitmap_obstack;

  /* An obstack to use for general allocation.  */
  obstack m_obstack;

  /* The number of loops in the function.  */
  unsigned int m_nloops;

  /* The total number of loop version conditions we've found.  */
  unsigned int m_num_conditions;

  /* Assume that an address fragment of the form i * stride * scale
     (for variable stride and constant scale) will not benefit from
     versioning for stride == 1 when scale is greater than this value.  */
  unsigned HOST_WIDE_INT m_maximum_scale;

  /* Information about each loop, indexed by loop number.  */
  auto_vec<loop_info> m_loops;

  /* Used to form a linked list of blocks that belong to a loop,
     started by loop_info::block_list.  Indexed by block index.  */
  auto_vec<basic_block> m_next_block_in_loop;

  /* The list of loops that we've decided to version.  */
  auto_vec<class loop *> m_loops_to_version;

  /* A table of addresses in the current loop, keyed off their values
     but not their offsets.  */
  hash_table <address_info_hasher> m_address_table;

  /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order.  */
  auto_vec <address_info *, 32> m_address_list;
};
|
|
359
|
|
360 /* If EXPR is an SSA name and not a default definition, return the
|
|
361 defining statement, otherwise return null. */
|
|
362
|
|
363 static gimple *
|
|
364 maybe_get_stmt (tree expr)
|
|
365 {
|
|
366 if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr))
|
|
367 return SSA_NAME_DEF_STMT (expr);
|
|
368 return NULL;
|
|
369 }
|
|
370
|
|
/* Like maybe_get_stmt, but also return null if the defining
   statement isn't an assignment.  */

static gassign *
maybe_get_assign (tree expr)
{
  /* safe_dyn_cast handles the null result from maybe_get_stmt.  */
  return safe_dyn_cast <gassign *> (maybe_get_stmt (expr));
}
|
|
379
|
|
380 /* Return true if this pass should look through a cast of expression FROM
|
|
381 to type TYPE when analyzing pieces of an address. */
|
|
382
|
|
383 static bool
|
|
384 look_through_cast_p (tree type, tree from)
|
|
385 {
|
|
386 return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type)
|
|
387 && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type));
|
|
388 }
|
|
389
|
|
/* Strip all conversions of integers or pointers from EXPR, regardless
   of whether the conversions are nops.  This is useful in the context
   of this pass because we're not trying to fold or simulate the
   expression; we just want to see how it's structured.  */

static tree
strip_casts (tree expr)
{
  /* Bound the walk through SSA definitions to keep compile time down.  */
  const unsigned int MAX_NITERS = 4;

  tree type = TREE_TYPE (expr);
  /* First strip conversions that appear directly in the tree.  */
  while (CONVERT_EXPR_P (expr)
	 && look_through_cast_p (type, TREE_OPERAND (expr, 0)))
    expr = TREE_OPERAND (expr, 0);

  /* Then follow SSA names whose defining statements are conversions.  */
  for (unsigned int niters = 0; niters < MAX_NITERS; ++niters)
    {
      gassign *assign = maybe_get_assign (expr);
      if (assign
	  && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))
	  && look_through_cast_p (type, gimple_assign_rhs1 (assign)))
	expr = gimple_assign_rhs1 (assign);
      else
	break;
    }
  return expr;
}
|
|
417
|
|
418 /* Compare two address_term_infos in the same address_info. */
|
|
419
|
|
420 static int
|
|
421 compare_address_terms (const void *a_uncast, const void *b_uncast)
|
|
422 {
|
|
423 const address_term_info *a = (const address_term_info *) a_uncast;
|
|
424 const address_term_info *b = (const address_term_info *) b_uncast;
|
|
425
|
|
426 if (a->expr != b->expr)
|
|
427 return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1;
|
|
428
|
|
429 if (a->multiplier != b->multiplier)
|
|
430 return a->multiplier < b->multiplier ? -1 : 1;
|
|
431
|
|
432 return 0;
|
|
433 }
|
|
434
|
|
/* Dump ADDRESS using flags FLAGS, in the form:

     BASE + EXPR1 * MULT1 + ... + [MIN_OFFSET, MAX_OFFSET - 1]  */

static void
dump_address_info (dump_flags_t flags, address_info &address)
{
  if (address.base)
    dump_printf (flags, "%T + ", address.base);
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    {
      if (i != 0)
	dump_printf (flags, " + ");
      dump_printf (flags, "%T", address.terms[i].expr);
      if (address.terms[i].multiplier != 1)
	dump_printf (flags, " * %wd", address.terms[i].multiplier);
    }
  /* MAX_OFFSET is exclusive, so print the inclusive upper bound.  */
  dump_printf (flags, " + [%wd, %wd]",
	       address.min_offset, address.max_offset - 1);
}
|
|
453
|
|
/* Hash an address_info based on its base and terms.  Must stay in sync
   with address_info_hasher::equal below.  */

hashval_t
address_info_hasher::hash (const address_info *info)
{
  inchash::hash hash;
  /* Hash only the tree code of the base; equal () does a deep compare.  */
  hash.add_int (info->base ? TREE_CODE (info->base) : 0);
  hash.add_int (info->terms.length ());
  for (unsigned int i = 0; i < info->terms.length (); ++i)
    {
      hash.add_int (SSA_NAME_VERSION (info->terms[i].expr));
      hash.add_hwi (info->terms[i].multiplier);
    }
  return hash.end ();
}
|
|
469
|
|
/* Return true if two address_infos have equal bases and terms.  Other
   properties might be different (such as the statement or constant
   offset range).  */

bool
address_info_hasher::equal (const address_info *a, const address_info *b)
{
  /* Bases are non-SSA trees, so compare them structurally.  */
  if (a->base != b->base
      && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0)))
    return false;

  if (a->terms.length () != b->terms.length ())
    return false;

  /* Terms are kept sorted (see compare_address_terms), so a pairwise
     comparison is enough.  */
  for (unsigned int i = 0; i < a->terms.length (); ++i)
    if (a->terms[i].expr != b->terms[i].expr
	|| a->terms[i].multiplier != b->terms[i].multiplier)
      return false;

  return true;
}
|
|
491
|
|
492 /* Return true if we want to version the loop, i.e. if we have a
|
|
493 specific reason for doing so and no specific reason not to. */
|
|
494
|
|
495 bool
|
|
496 loop_info::worth_versioning_p () const
|
|
497 {
|
|
498 return (!rejected_p
|
|
499 && (!bitmap_empty_p (&unity_names) || subloops_benefit_p));
|
|
500 }
|
|
501
|
|
/* Construct a dominator walker for pass LV.  The false argument asks
   evrp_range_analyzer not to update global ranges as a side effect.  */

loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
  : dom_walker (CDI_DOMINATORS), m_lv (lv), m_range_analyzer (false)
{
}
|
|
506
|
|
/* Process BB before processing the blocks it dominates.  Prune the
   versioning conditions of BB's loop (if BB is the loop header) using
   the context-sensitive ranges that dominate BB, then record the ranges
   implied by BB's own statements for the dominated blocks.  */

edge
loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
{
  m_range_analyzer.enter (bb);

  /* Conditions are pruned once per loop, at its header.  */
  if (bb == bb->loop_father->header)
    m_lv.prune_loop_conditions (bb->loop_father,
				m_range_analyzer.get_vr_values ());

  for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
       gsi_next (&si))
    m_range_analyzer.record_ranges_from_stmt (gsi_stmt (si), false);

  return NULL;
}
|
|
524
|
|
/* Process BB after processing the blocks it dominates.  Pop the ranges
   that were pushed when entering BB.  */

void
loop_versioning::lv_dom_walker::after_dom_children (basic_block bb)
{
  m_range_analyzer.leave (bb);
}
|
|
532
|
|
/* Decide whether to replace VAL with a new value in a versioned loop.
   Return the new value if so, otherwise return null.  In practice this
   replaces each SSA name in the loop's unity_names set with constant 1,
   since the versioned loop is only entered when those names are 1.  */

tree
loop_versioning::name_prop::get_value (tree val)
{
  if (TREE_CODE (val) == SSA_NAME
      && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val)))
    return build_one_cst (TREE_TYPE (val));
  return NULL_TREE;
}
|
|
544
|
|
/* Initialize the structure to optimize FN.  */

loop_versioning::loop_versioning (function *fn)
  : m_fn (fn),
    m_nloops (number_of_loops (fn)),
    m_num_conditions (0),
    m_address_table (31)
{
  bitmap_obstack_initialize (&m_bitmap_obstack);
  gcc_obstack_init (&m_obstack);

  /* Initialize the loop information.  Each loop_info starts with the
     whole function (loop 0) as the outermost loop for its checks.  */
  m_loops.safe_grow_cleared (m_nloops);
  for (unsigned int i = 0; i < m_nloops; ++i)
    {
      m_loops[i].outermost = get_loop (m_fn, 0);
      bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack);
    }

  /* Initialize the list of blocks that belong to each loop by threading
     every block onto the chain of its immediate loop father.  */
  unsigned int nbbs = last_basic_block_for_fn (fn);
  m_next_block_in_loop.safe_grow (nbbs);
  basic_block bb;
  FOR_EACH_BB_FN (bb, fn)
    {
      loop_info &li = get_loop_info (bb->loop_father);
      m_next_block_in_loop[bb->index] = li.block_list;
      li.block_list = bb;
    }

  /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for
     unvectorizable code, since it is the largest size that can be
     handled efficiently by scalar code.  omp_max_vf calculates the
     maximum number of bytes in a vector, when such a value is relevant
     to loop optimization.  */
  m_maximum_scale = estimated_poly_value (omp_max_vf ());
  m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
}
|
|
583
|
|
/* Release the pass-specific obstacks; all bitmaps and general
   allocations made by the pass are freed with them.  */

loop_versioning::~loop_versioning ()
{
  bitmap_obstack_release (&m_bitmap_obstack);
  obstack_free (&m_obstack, NULL);
}
|
|
589
|
|
/* Return the maximum number of instructions allowed in LOOP before
   it becomes too big for versioning.

   There are separate limits for inner and outer loops.  The limit for
   inner loops applies only to loops that benefit directly from versioning.
   The limit for outer loops applies to all code in the outer loop and
   its subloops that *doesn't* benefit directly from versioning; such code
   would be "taken along for the ride".  The idea is that if the cost of
   the latter is small, it is better to version outer loops rather than
   inner loops, both to reduce the number of repeated checks and to enable
   more of the loop nest to be optimized as a natural nest (e.g. by loop
   interchange or outer-loop vectorization).  */

unsigned int
loop_versioning::max_insns_for_loop (class loop *loop)
{
  /* LOOP->inner is nonnull iff LOOP contains subloops, i.e. iff it is
     an outer loop.  */
  return (loop->inner
	  ? param_loop_versioning_max_outer_insns
	  : param_loop_versioning_max_inner_insns);
}
|
|
610
|
|
/* Return true if for cost reasons we should avoid versioning any loop
   that contains STMT.

   Note that we don't need to check whether versioning is invalid for
   correctness reasons, since the versioning process does that for us.
   The conditions involved are too rare to be worth duplicating here.  */

bool
loop_versioning::expensive_stmt_p (gimple *stmt)
{
  if (gcall *call = dyn_cast <gcall *> (stmt))
    /* Assume for now that the time spent in an "expensive" call would
       overwhelm any saving from versioning.  */
    return !gimple_inexpensive_call_p (call);
  return false;
}
|
|
627
|
|
/* Record that we want to version the loop that contains STMT for the
   case in which SSA name NAME is equal to 1.  We already know that NAME
   is invariant in the loop.  */

void
loop_versioning::version_for_unity (gimple *stmt, tree name)
{
  class loop *loop = loop_containing_stmt (stmt);
  loop_info &li = get_loop_info (loop);

  /* bitmap_set_bit returns true only if the bit was newly set.  */
  if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name)))
    {
      /* This is the first time we've wanted to version LOOP for NAME.
	 Keep track of the outermost loop that can handle all versioning
	 checks in LI.  */
      class loop *outermost
	= outermost_invariant_loop_for_expr (loop, name);
      /* Note that a greater depth means a more deeply nested loop, so
	 LI.outermost only moves inwards as checks accumulate.  */
      if (loop_depth (li.outermost) < loop_depth (outermost))
	li.outermost = outermost;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop"
			   " for when %T == 1", name);
	  if (outermost == loop)
	    dump_printf (MSG_NOTE, "; cannot hoist check further");
	  else
	    {
	      dump_printf (MSG_NOTE, "; could implement the check at loop"
			   " depth %d", loop_depth (outermost));
	      if (loop_depth (li.outermost) > loop_depth (outermost))
		dump_printf (MSG_NOTE, ", but other checks only allow"
			     " a depth of %d", loop_depth (li.outermost));
	    }
	  dump_printf (MSG_NOTE, "\n");
	}

      m_num_conditions += 1;
    }
  else
    {
      /* This is a duplicate request.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing"
			 " loop for when %T == 1\n", name);
    }
}
|
|
675
|
|
/* Return true if OP1_TREE is constant and if in principle it is worth
   versioning an address fragment of the form:

     i * OP1_TREE * OP2 * stride

   for the case in which stride == 1.  This in practice means testing
   whether:

     OP1_TREE * OP2 <= M_MAXIMUM_SCALE.

   If RESULT is nonnull, store OP1_TREE * OP2 there when returning true.  */

bool
loop_versioning::acceptable_multiplier_p (tree op1_tree,
					  unsigned HOST_WIDE_INT op2,
					  unsigned HOST_WIDE_INT *result)
{
  if (tree_fits_uhwi_p (op1_tree))
    {
      unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree);
      /* The first part checks for overflow: if OP1 * OP2 wrapped,
	 the product would be smaller than OP2 (for OP1 > 0; OP1 == 0
	 is rejected too when OP2 > 0, which is fine since a zero
	 multiplier is never useful).  */
      if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale)
	{
	  if (result)
	    *result = op1 * op2;
	  return true;
	}
    }
  return false;
}
|
|
706
|
|
/* Return true if it is worth using loop versioning on a memory access
   of type TYPE.  Store the size of the access in *SIZE if so.
   Rejects variable-sized types and sizes above M_MAXIMUM_SCALE.  */

bool
loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size)
{
  return (TYPE_SIZE_UNIT (type)
	  && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size));
}
|
|
716
|
|
/* See whether OP is constant and whether we can multiply TERM by that
   constant without exceeding M_MAXIMUM_SCALE.  Return true and update
   TERM's multiplier in place if so.  */

bool
loop_versioning::multiply_term_by (address_term_info &term, tree op)
{
  return acceptable_multiplier_p (op, term.multiplier, &term.multiplier);
}
|
|
726
|
|
/* Decide whether an address fragment of the form STRIDE * MULTIPLIER
   is likely to be indexing an innermost dimension, returning the result
   as an INNER_* probability.  */

inner_likelihood
loop_versioning::get_inner_likelihood (tree stride,
				       unsigned HOST_WIDE_INT multiplier)
{
  /* Bound on the number of candidate values of STRIDE we explore.  */
  const unsigned int MAX_NITERS = 8;

  /* Iterate over possible values of STRIDE.  Return INNER_LIKELY if at
     least one of those values is likely to be for the innermost dimension.
     Record in UNLIKELY_P if at least one of those values is unlikely to be
     for the innermost dimension.

     E.g. for:

       stride = cond ? a * b : 1

     we should treat STRIDE as being a likely inner dimension, since
     we know that it is 1 under at least some circumstances.  (See the
     Fortran example below.)  However:

       stride = a * b

     on its own is unlikely to be for the innermost dimension, since
     that would require both a and b to be 1 at runtime.  */
  bool unlikely_p = false;
  tree worklist[MAX_NITERS];
  unsigned int length = 0;
  worklist[length++] = stride;
  /* LENGTH can grow while we iterate (see the PHI case below), so this
     is a genuine worklist walk, not a fixed-bound loop.  */
  for (unsigned int i = 0; i < length; ++i)
    {
      tree expr = worklist[i];

      if (CONSTANT_CLASS_P (expr))
	{
	  /* See if EXPR * MULTIPLIER would be consistent with an individual
	     access or a small grouped access.  */
	  if (acceptable_multiplier_p (expr, multiplier))
	    return INNER_LIKELY;
	  else
	    unlikely_p = true;
	}
      else if (gimple *stmt = maybe_get_stmt (expr))
	{
	  /* If EXPR is set by a PHI node, queue its arguments in case
	     we find one that is consistent with an inner dimension.

	     An important instance of this is the Fortran handling of array
	     descriptors, which calculates the stride of the inner dimension
	     using a PHI equivalent of:

		raw_stride = a.dim[0].stride;
		stride = raw_stride != 0 ? raw_stride : 1;

	     (Strides for outer dimensions do not treat 0 specially.)  */
	  if (gphi *phi = dyn_cast <gphi *> (stmt))
	    {
	      unsigned int nargs = gimple_phi_num_args (phi);
	      for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j)
		worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j));
	    }
	  /* If the value is set by an assignment, expect it to be read
	     from memory (such as an array descriptor) rather than be
	     calculated.  */
	  else if (gassign *assign = dyn_cast <gassign *> (stmt))
	    {
	      if (!gimple_assign_load_p (assign))
		unlikely_p = true;
	    }
	  /* Things like calls don't really tell us anything.  */
	}
    }

  /* We didn't find any possible values of STRIDE that were likely to be
     for the innermost dimension.  If we found one that was actively
     unlikely to be for the innermost dimension, assume that that applies
     to STRIDE too.  */
  return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW;
}
|
|
808
|
|
/* Dump the likelihood that TERM's stride is for the innermost dimension.
   ADDRESS is the address that contains TERM; its statement is used to
   give the dump a source location.  */

void
loop_versioning::dump_inner_likelihood (address_info &address,
					address_term_info &term)
{
  if (term.inner_likelihood == INNER_LIKELY)
    dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the"
		     " innermost dimension\n", term.stride);
  else if (term.inner_likelihood == INNER_UNLIKELY)
    dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the"
		     " innermost dimension\n", term.stride);
  else
    dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T"
		     " is the innermost dimension\n", term.stride);
}
|
|
826
|
|
/* The caller has identified that STRIDE is the stride of interest
   in TERM, and that the stride is applied in OP_LOOP.  Record this
   information in TERM, deciding whether STRIDE is likely to be for
   the innermost dimension of an array and whether it represents a
   versioning opportunity.  ADDRESS is the address that contains TERM.  */

void
loop_versioning::analyze_stride (address_info &address,
				 address_term_info &term,
				 tree stride, class loop *op_loop)
{
  term.stride = stride;

  term.inner_likelihood = get_inner_likelihood (stride, term.multiplier);
  if (dump_enabled_p ())
    dump_inner_likelihood (address, term);

  /* To be a versioning opportunity we require:

     - The multiplier applied by TERM is equal to the access size,
       so that when STRIDE is 1, the accesses in successive loop
       iterations are consecutive.

       This is deliberately conservative.  We could relax it to handle
       other cases (such as those with gaps between iterations) if we
       find any real testcases for which it's useful.

     - the stride is applied in the same loop as STMT rather than
       in an outer loop.  Although versioning for strides applied in
       outer loops could help in some cases -- such as enabling
       more loop interchange -- the savings are much lower than for
       inner loops.

     - the stride is an SSA name that is invariant in STMT's loop,
       since otherwise versioning isn't possible.  */
  unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset;
  if (term.multiplier == access_size
      && address.loop == op_loop
      && TREE_CODE (stride) == SSA_NAME
      && expr_invariant_in_loop_p (address.loop, stride))
    {
      term.versioning_opportunity_p = true;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning"
			 " opportunity\n", stride);
    }
}
|
|
874
|
|
875 /* See whether address term TERM (which belongs to ADDRESS) is the result
|
|
876 of multiplying a varying SSA name by a loop-invariant SSA name.
|
|
877 Return true and update TERM if so.
|
|
878
|
|
879 This handles both cases that SCEV might handle, such as:
|
|
880
|
|
881 for (int i = 0; i < n; ++i)
|
|
882 res += a[i * stride];
|
|
883
|
|
884 and ones in which the term varies arbitrarily between iterations, such as:
|
|
885
|
|
886 for (int i = 0; i < n; ++i)
|
|
887 res += a[index[i] * stride]; */
|
|
888
|
|
889 bool
|
|
890 loop_versioning::find_per_loop_multiplication (address_info &address,
|
|
891 address_term_info &term)
|
|
892 {
|
|
893 gassign *mult = maybe_get_assign (term.expr);
|
|
894 if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR)
|
|
895 return false;
|
|
896
|
|
897 class loop *mult_loop = loop_containing_stmt (mult);
|
|
898 if (!loop_outer (mult_loop))
|
|
899 return false;
|
|
900
|
|
901 tree op1 = strip_casts (gimple_assign_rhs1 (mult));
|
|
902 tree op2 = strip_casts (gimple_assign_rhs2 (mult));
|
|
903 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
|
|
904 return false;
|
|
905
|
|
906 bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1);
|
|
907 bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2);
|
|
908 if (invariant1_p == invariant2_p)
|
|
909 return false;
|
|
910
|
|
911 /* Make sure that the loop invariant is OP2 rather than OP1. */
|
|
912 if (invariant1_p)
|
|
913 std::swap (op1, op2);
|
|
914
|
|
915 if (dump_enabled_p ())
|
|
916 dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T"
|
|
917 " * loop-invariant %T\n", term.expr, op1, op2);
|
|
918 analyze_stride (address, term, op2, mult_loop);
|
|
919 return true;
|
|
920 }
|
|
921
|
|
/* Try to use scalar evolutions to find an address stride for TERM,
   which belongs to ADDRESS.  Return true and update TERM if so.

   Here we are interested in any evolution information we can find,
   not just evolutions wrt ADDRESS->LOOP.  For example, if we find that
   an outer loop obviously iterates over the inner dimension of an array,
   that information can help us eliminate worthless versioning opportunities
   in inner loops.  */

bool
loop_versioning::analyze_term_using_scevs (address_info &address,
					   address_term_info &term)
{
  gimple *setter = maybe_get_stmt (term.expr);
  if (!setter)
    return false;

  /* SCEV information is only meaningful if the defining statement is
     inside a loop.  */
  class loop *wrt_loop = loop_containing_stmt (setter);
  if (!loop_outer (wrt_loop))
    return false;

  tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr));
  if (TREE_CODE (chrec) == POLYNOMIAL_CHREC)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, address.stmt,
			 "address term %T = %T\n", term.expr, chrec);

      /* Peel casts and accumulate constant multiplications, up to the
	 limit allowed by M_MAXIMUM_SCALE.  First handle multiplications
	 that are folded into the chrec step itself...  */
      tree stride = strip_casts (CHREC_RIGHT (chrec));
      while (TREE_CODE (stride) == MULT_EXPR
	     && multiply_term_by (term, TREE_OPERAND (stride, 1)))
	stride = strip_casts (TREE_OPERAND (stride, 0));

      /* ...then follow SSA definitions to peel multiplications that
	 remain as separate statements.  */
      gassign *assign;
      while ((assign = maybe_get_assign (stride))
	     && gimple_assign_rhs_code (assign) == MULT_EXPR
	     && multiply_term_by (term, gimple_assign_rhs2 (assign)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, address.stmt,
			     "looking through %G", assign);
	  stride = strip_casts (gimple_assign_rhs1 (assign));
	}

      analyze_stride (address, term, stride, get_chrec_loop (chrec));
      return true;
    }

  return false;
}
|
|
974
|
|
975 /* Address term TERM is an arbitrary term that provides no versioning
|
|
976 opportunities. Analyze it to see whether it contains any likely
|
|
977 inner strides, so that we don't mistakenly version for other
|
|
978 (less likely) candidates.
|
|
979
|
|
980 This copes with invariant innermost indices such as:
|
|
981
|
|
982 x(i, :) = 100
|
|
983
|
|
984 where the "i" component of the address is invariant in the loop
|
|
985 but provides the real inner stride.
|
|
986
|
|
987 ADDRESS is the address that contains TERM. */
|
|
988
|
|
989 void
|
|
990 loop_versioning::analyze_arbitrary_term (address_info &address,
|
|
991 address_term_info &term)
|
|
992
|
|
993 {
|
|
994 /* A multiplication offers two potential strides. Pick the one that
|
|
995 is most likely to be an innermost stride. */
|
|
996 tree expr = term.expr, alt = NULL_TREE;
|
|
997 gassign *mult = maybe_get_assign (expr);
|
|
998 if (mult && gimple_assign_rhs_code (mult) == MULT_EXPR)
|
|
999 {
|
|
1000 expr = strip_casts (gimple_assign_rhs1 (mult));
|
|
1001 alt = strip_casts (gimple_assign_rhs2 (mult));
|
|
1002 }
|
|
1003 term.stride = expr;
|
|
1004 term.inner_likelihood = get_inner_likelihood (expr, term.multiplier);
|
|
1005 if (alt)
|
|
1006 {
|
|
1007 inner_likelihood alt_l = get_inner_likelihood (alt, term.multiplier);
|
|
1008 if (alt_l > term.inner_likelihood)
|
|
1009 {
|
|
1010 term.stride = alt;
|
|
1011 term.inner_likelihood = alt_l;
|
|
1012 }
|
|
1013 }
|
|
1014 if (dump_enabled_p ())
|
|
1015 dump_inner_likelihood (address, term);
|
|
1016 }
|
|
1017
|
|
/* Try to identify loop strides in ADDRESS and try to choose realistic
   versioning opportunities based on these strides.

   The main difficulty here isn't finding strides that could be used
   in a version check (that's pretty easy).  The problem instead is to
   avoid versioning for some stride S that is unlikely ever to be 1 at
   runtime.  Versioning for S == 1 on its own would lead to unnecessary
   code bloat, while adding S == 1 to more realistic version conditions
   would lose the optimisation opportunity offered by those other conditions.

   For example, versioning for a stride of 1 in the Fortran code:

     integer :: a(:,:)
     a(1,:) = 1

   is not usually a good idea, since the assignment is iterating over
   an outer dimension and is relatively unlikely to have a stride of 1.
   (It isn't impossible, since the inner dimension might be 1, or the
   array might be transposed.)  Similarly, in:

     integer :: a(:,:), b(:,:)
     b(:,1) = a(1,:)

   b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't.
   Versioning for when both strides are 1 would lose most of the benefit
   of versioning for b's access.

   The approach we take is as follows:

   - Analyze each term to see whether it has an identifiable stride,
     regardless of which loop applies the stride.

   - Evaluate the likelihood that each such stride is for the innermost
     dimension of an array, on the scale "likely", "don't know" or "unlikely".

   - If there is a single "likely" innermost stride, and that stride is
     applied in the loop that contains STMT, version the loop for when the
     stride is 1.  This deals with the cases in which we're fairly
     confident of doing the right thing, such as the b(:,1) reference above.

   - If there are no "likely" innermost strides, and the loop that contains
     STMT uses a stride that we rated as "don't know", version for when
     that stride is 1.  This is principally used for C code such as:

       for (int i = 0; i < n; ++i)
	 a[i * x] = ...;

     and:

       for (int j = 0; j < n; ++j)
	 for (int i = 0; i < n; ++i)
	   a[i * x + j * y] = ...;

     where nothing in the way "x" and "y" are set gives a hint as to
     whether "i" iterates over the innermost dimension of the array.
     In these situations it seems reasonable to assume that the
     programmer has nested the loops appropriately (although of course
     there are examples like GEMM in which this assumption doesn't hold
     for all accesses in the loop).

     This case is also useful for the Fortran equivalent of the
     above C code.  */

void
loop_versioning::analyze_address_fragment (address_info &address)
{
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment ");
      dump_address_info (MSG_NOTE, address);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Analyze each component of the sum to see whether it involves an
     apparent stride.

     There is an overlap between the addresses that
     find_per_loop_multiplication and analyze_term_using_scevs can handle,
     but the former is much cheaper than SCEV analysis, so try it first.  */
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    if (!find_per_loop_multiplication (address, address.terms[i])
	&& !analyze_term_using_scevs (address, address.terms[i])
	&& !POINTER_TYPE_P (TREE_TYPE (address.terms[i].expr)))
      analyze_arbitrary_term (address, address.terms[i]);

  /* Check for strides that are likely to be for the innermost dimension.

     1. If there is a single likely inner stride, if it is an SSA name,
	and if it is worth versioning the loop for when the SSA name
	equals 1, record that we want to do so.

     2. Otherwise, if there are any likely inner strides, bail out.
	This means one of:

	(a) There are multiple likely inner strides.  This suggests we're
	    confused and can't be confident of doing the right thing.

	(b) There is a single likely inner stride and it is a constant
	    rather than an SSA name.  This can mean either that the access
	    is a natural one without any variable strides, such as:

	      for (int i = 0; i < n; ++i)
		a[i] += 1;

	    or that a variable stride is applied to an outer dimension,
	    such as:

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[j * stride][i] += 1;

	(c) There is a single likely inner stride, and it is an SSA name,
	    but it isn't a worthwhile versioning opportunity.  This usually
	    means that the variable stride is applied by an outer loop,
	    such as:

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[j][i * stride] += 1;

	    or (using an example with a more natural loop nesting):

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[i][j] += b[i * stride];

	    in cases where b[i * stride] cannot (yet) be hoisted for
	    aliasing reasons.

     3. If there are no likely inner strides, fall through to the next
	set of checks.

     Pointer equality is enough to check for uniqueness in (1), since we
     only care about SSA names.  */
  tree chosen_stride = NULL_TREE;
  tree version_stride = NULL_TREE;
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    if (chosen_stride != address.terms[i].stride
	&& address.terms[i].inner_likelihood == INNER_LIKELY)
      {
	/* A second distinct likely stride means case (2a): bail out.  */
	if (chosen_stride)
	  return;
	chosen_stride = address.terms[i].stride;
	if (address.terms[i].versioning_opportunity_p)
	  version_stride = chosen_stride;
      }

  /* If there are no likely inner strides, see if there is a single
     versioning opportunity for a stride that was rated as INNER_DONT_KNOW.
     See the comment above the function for the cases that this code
     handles.  */
  if (!chosen_stride)
    for (unsigned int i = 0; i < address.terms.length (); ++i)
      if (version_stride != address.terms[i].stride
	  && address.terms[i].inner_likelihood == INNER_DONT_KNOW
	  && address.terms[i].versioning_opportunity_p)
	{
	  /* Multiple "don't know" opportunities: too ambiguous to act.  */
	  if (version_stride)
	    return;
	  version_stride = address.terms[i].stride;
	}

  if (version_stride)
    version_for_unity (address.stmt, version_stride);
}
|
|
1183
|
|
/* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses
   TYPE_SIZE bytes and record this address fragment for later processing.
   STMT is the statement that contains the address.  */

void
loop_versioning::record_address_fragment (gimple *stmt,
					  unsigned HOST_WIDE_INT type_size,
					  tree expr,
					  unsigned HOST_WIDE_INT multiplier,
					  HOST_WIDE_INT offset)
{
  /* We're only interested in computed values.  */
  if (TREE_CODE (expr) != SSA_NAME)
    return;

  /* Quick exit if no part of the address is calculated in STMT's loop,
     since such addresses have no versioning opportunities.  */
  class loop *loop = loop_containing_stmt (stmt);
  if (expr_invariant_in_loop_p (loop, expr))
    return;

  /* Set up an address_info for EXPR * MULTIPLIER.  The object is
     obstack-allocated and placement-new'd; it is freed below on the
     early-exit paths and pooled into the hash table otherwise.  */
  address_info *address = XOBNEW (&m_obstack, address_info);
  new (address) address_info;
  address->stmt = stmt;
  address->loop = loop;
  address->base = NULL_TREE;
  address->terms.quick_grow (1);
  address->terms[0].expr = expr;
  address->terms[0].multiplier = multiplier;
  address->terms[0].stride = NULL_TREE;
  address->terms[0].inner_likelihood = INNER_UNLIKELY;
  address->terms[0].versioning_opportunity_p = false;
  address->min_offset = offset;

  /* Peel apart the expression into a sum of address_terms, where each
     term is multiplied by a constant.  Treat a + b and a - b the same,
     since it doesn't matter for our purposes whether an address is
     increasing or decreasing.  Distribute (a + b) * constant into
     a * constant + b * constant.

     We don't care which loop each term belongs to, since we want to
     examine as many candidate strides as possible when determining
     which is likely to be for the innermost dimension.  We therefore
     don't limit the search to statements in STMT's loop.

     Note that the "continue" statements below rescan the current index I
     after rewriting the term in place; I only advances when no rewrite
     applied.  */
  for (unsigned int i = 0; i < address->terms.length (); )
    {
      if (gassign *assign = maybe_get_assign (address->terms[i].expr))
	{
	  tree_code code = gimple_assign_rhs_code (assign);
	  if (code == PLUS_EXPR
	      || code == POINTER_PLUS_EXPR
	      || code == MINUS_EXPR)
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      tree op2 = gimple_assign_rhs2 (assign);
	      if (TREE_CODE (op2) == INTEGER_CST)
		{
		  address->terms[i].expr = strip_casts (op1);
		  /* This is heuristic only, so don't worry about truncation
		     or overflow.  */
		  address->min_offset += (TREE_INT_CST_LOW (op2)
					  * address->terms[i].multiplier);
		  continue;
		}
	      else if (address->terms.length () < address_info::MAX_TERMS)
		{
		  unsigned int j = address->terms.length ();
		  address->terms.quick_push (address->terms[i]);
		  address->terms[i].expr = strip_casts (op1);
		  address->terms[j].expr = strip_casts (op2);
		  continue;
		}
	    }
	  if (code == MULT_EXPR)
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      tree op2 = gimple_assign_rhs2 (assign);
	      if (multiply_term_by (address->terms[i], op2))
		{
		  address->terms[i].expr = strip_casts (op1);
		  continue;
		}
	    }
	  if (CONVERT_EXPR_CODE_P (code))
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      address->terms[i].expr = strip_casts (op1);
	      continue;
	    }
	}
      i += 1;
    }

  /* Peel off any symbolic pointer.  */
  if (TREE_CODE (address->terms[0].expr) != SSA_NAME
      && address->terms[0].multiplier == 1)
    {
      /* Nothing left to analyze if the pointer was the only term.  */
      if (address->terms.length () == 1)
	{
	  obstack_free (&m_obstack, address);
	  return;
	}
      address->base = address->terms[0].expr;
      address->terms.ordered_remove (0);
    }

  /* Require all remaining terms to be SSA names.  (This could be false
     for unfolded statements, but they aren't worth dealing with.)  */
  for (unsigned int i = 0; i < address->terms.length (); ++i)
    if (TREE_CODE (address->terms[i].expr) != SSA_NAME)
      {
	obstack_free (&m_obstack, address);
	return;
      }

  /* The loop above set MIN_OFFSET based on the first byte of the
     referenced data.  Calculate the end + 1.  */
  address->max_offset = address->min_offset + type_size;

  /* Put the terms into a canonical order for the hash table lookup below.  */
  address->terms.qsort (compare_address_terms);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr);
      if (multiplier != 1)
	dump_printf (MSG_NOTE, " * %wd", multiplier);
      dump_printf (MSG_NOTE, " = ");
      dump_address_info (MSG_NOTE, *address);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Pool address information with the same terms (but potentially
     different offsets).  */
  address_info **slot = m_address_table.find_slot (address, INSERT);
  if (address_info *old_address = *slot)
    {
      /* We've already seen an address with the same terms.  Extend the
	 offset range to account for this access.  Doing this can paper
	 over gaps, such as in:

	   a[i * stride * 4] + a[i * stride * 4 + 3];

	 where nothing references "+ 1" or "+ 2".  However, the vectorizer
	 handles such gapped accesses without problems, so it's not worth
	 trying to exclude them.  */
      if (old_address->min_offset > address->min_offset)
	old_address->min_offset = address->min_offset;
      if (old_address->max_offset < address->max_offset)
	old_address->max_offset = address->max_offset;
      obstack_free (&m_obstack, address);
    }
  else
    {
      /* This is the first time we've seen an address with these terms.  */
      *slot = address;
      m_address_list.safe_push (address);
    }
}
|
|
1344
|
|
1345 /* Analyze expression EXPR, which occurs in STMT. */
|
|
1346
|
|
1347 void
|
|
1348 loop_versioning::analyze_expr (gimple *stmt, tree expr)
|
|
1349 {
|
|
1350 unsigned HOST_WIDE_INT type_size;
|
|
1351
|
|
1352 while (handled_component_p (expr))
|
|
1353 {
|
|
1354 /* See whether we can use versioning to avoid a multiplication
|
|
1355 in an array index. */
|
|
1356 if (TREE_CODE (expr) == ARRAY_REF
|
|
1357 && acceptable_type_p (TREE_TYPE (expr), &type_size))
|
|
1358 record_address_fragment (stmt, type_size,
|
|
1359 TREE_OPERAND (expr, 1), type_size, 0);
|
|
1360 expr = TREE_OPERAND (expr, 0);
|
|
1361 }
|
|
1362
|
|
1363 /* See whether we can use versioning to avoid a multiplication
|
|
1364 in the pointer calculation of a MEM_REF. */
|
|
1365 if (TREE_CODE (expr) == MEM_REF
|
|
1366 && acceptable_type_p (TREE_TYPE (expr), &type_size))
|
|
1367 record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1,
|
|
1368 /* This is heuristic only, so don't worry
|
|
1369 about truncation or overflow. */
|
|
1370 TREE_INT_CST_LOW (TREE_OPERAND (expr, 1)));
|
|
1371
|
|
1372 /* These would be easy to handle if they existed at this stage. */
|
|
1373 gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF);
|
|
1374 }
|
|
1375
|
|
/* Analyze all the statements in BB looking for useful version checks.
   Return true on success, false if something prevents the block from
   being versioned.  */

bool
loop_versioning::analyze_block (basic_block bb)
{
  class loop *loop = bb->loop_father;
  loop_info &li = get_loop_info (loop);
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      /* Debug statements don't affect codegen, so neither count them
	 nor analyze them.  */
      if (is_gimple_debug (stmt))
	continue;

      /* Duplicating an expensive statement in the version check path
	 would outweigh any benefit, so reject the whole block.  */
      if (expensive_stmt_p (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, stmt, "expensive statement"
			     " prevents versioning: %G", stmt);
	  return false;
	}

      /* Only look for direct versioning opportunities in inner loops
	 since the benefit tends to be much smaller for outer loops.  */
      if (!loop->inner)
	{
	  unsigned int nops = gimple_num_ops (stmt);
	  for (unsigned int i = 0; i < nops; ++i)
	    if (tree op = gimple_op (stmt, i))
	      analyze_expr (stmt, op);
	}

      /* The point of the instruction limit is to prevent excessive
	 code growth, so this is a size-based estimate even though
	 the optimization is aimed at speed.  */
      li.num_insns += estimate_num_insns (stmt, &eni_size_weights);
    }

  return true;
}
|
|
1418
|
|
/* Analyze all the blocks in the function, looking for useful version checks.
   Return true if we found one.  */

bool
loop_versioning::analyze_blocks ()
{
  AUTO_DUMP_SCOPE ("analyze_blocks",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  /* For now we don't try to version the whole function, although
     versioning at that level could be useful in some cases.  */
  get_loop_info (get_loop (m_fn, 0)).rejected_p = true;

  class loop *loop;
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop_info &linfo = get_loop_info (loop);

      /* Ignore cold loops.  */
      if (!optimize_loop_for_speed_p (loop))
	linfo.rejected_p = true;

      /* See whether an inner loop prevents versioning of this loop.
	 Because we iterate from innermost outwards, the inner loops'
	 decisions are already final here.  */
      if (!linfo.rejected_p)
	for (class loop *inner = loop->inner; inner; inner = inner->next)
	  if (get_loop_info (inner).rejected_p)
	    {
	      linfo.rejected_p = true;
	      break;
	    }

      /* If versioning the loop is still a possibility, examine the
	 statements in the loop to look for versioning opportunities.  */
      if (!linfo.rejected_p)
	{
	  /* Obstack watermark: everything allocated while analyzing this
	     loop's address fragments is freed again below.  */
	  void *start_point = obstack_alloc (&m_obstack, 0);

	  for (basic_block bb = linfo.block_list; bb;
	       bb = m_next_block_in_loop[bb->index])
	    if (!analyze_block (bb))
	      {
		linfo.rejected_p = true;
		break;
	      }

	  if (!linfo.rejected_p)
	    {
	      /* Process any queued address fragments, now that we have
		 complete grouping information.  */
	      address_info *address;
	      unsigned int i;
	      FOR_EACH_VEC_ELT (m_address_list, i, address)
		analyze_address_fragment (*address);
	    }

	  /* Reset the per-loop state, even on the rejected path, so the
	     next loop starts from a clean slate.  */
	  m_address_table.empty ();
	  m_address_list.truncate (0);
	  obstack_free (&m_obstack, start_point);
	}
    }

  return m_num_conditions != 0;
}
|
|
1482
|
|
/* Use the ranges in VRS to remove impossible versioning conditions from
   LOOP.  */

void
loop_versioning::prune_loop_conditions (class loop *loop, vr_values *vrs)
{
  loop_info &li = get_loop_info (loop);

  /* Clearing a bit would invalidate the active bitmap iterator, so
     defer each removal until the next candidate is found (or until
     after the walk, for the last one).  */
  int to_remove = -1;
  bitmap_iterator bi;
  unsigned int i;
  EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
    {
      tree name = ssa_name (i);
      const value_range_equiv *vr = vrs->get_value_range (name);
      /* If range information proves the name can never equal 1 here,
	 the version condition can never select the original loop.  */
      if (vr && !vr->may_contain_p (build_one_cst (TREE_TYPE (name))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "%T can never be 1 in this loop\n", name);

	  if (to_remove >= 0)
	    bitmap_clear_bit (&li.unity_names, to_remove);
	  to_remove = i;
	  m_num_conditions -= 1;
	}
    }
  if (to_remove >= 0)
    bitmap_clear_bit (&li.unity_names, to_remove);
}
|
|
1513
|
|
1514 /* Remove any scheduled loop version conditions that will never be true.
|
|
1515 Return true if any remain. */
|
|
1516
|
|
1517 bool
|
|
1518 loop_versioning::prune_conditions ()
|
|
1519 {
|
|
1520 AUTO_DUMP_SCOPE ("prune_loop_conditions",
|
|
1521 dump_user_location_t::from_function_decl (m_fn->decl));
|
|
1522
|
|
1523 calculate_dominance_info (CDI_DOMINATORS);
|
|
1524 lv_dom_walker dom_walker (*this);
|
|
1525 dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn));
|
|
1526 return m_num_conditions != 0;
|
|
1527 }
|
|
1528
|
|
1529 /* Merge the version checks for INNER into immediately-enclosing loop
|
|
1530 OUTER. */
|
|
1531
|
|
1532 void
|
|
1533 loop_versioning::merge_loop_info (class loop *outer, class loop *inner)
|
|
1534 {
|
|
1535 loop_info &inner_li = get_loop_info (inner);
|
|
1536 loop_info &outer_li = get_loop_info (outer);
|
|
1537
|
|
1538 if (dump_enabled_p ())
|
|
1539 {
|
|
1540 bitmap_iterator bi;
|
|
1541 unsigned int i;
|
|
1542 EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi)
|
|
1543 if (!bitmap_bit_p (&outer_li.unity_names, i))
|
|
1544 dump_printf_loc (MSG_NOTE, find_loop_location (inner),
|
|
1545 "hoisting check that %T == 1 to outer loop\n",
|
|
1546 ssa_name (i));
|
|
1547 }
|
|
1548
|
|
1549 bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names);
|
|
1550 if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost))
|
|
1551 outer_li.outermost = inner_li.outermost;
|
|
1552 }
|
|
1553
|
|
1554 /* Add LOOP to the queue of loops to version. */
|
|
1555
|
|
1556 void
|
|
1557 loop_versioning::add_loop_to_queue (class loop *loop)
|
|
1558 {
|
|
1559 loop_info &li = get_loop_info (loop);
|
|
1560
|
|
1561 if (dump_enabled_p ())
|
|
1562 dump_printf_loc (MSG_NOTE, find_loop_location (loop),
|
|
1563 "queuing this loop for versioning\n");
|
|
1564 m_loops_to_version.safe_push (loop);
|
|
1565
|
|
1566 /* Don't try to version superloops. */
|
|
1567 li.rejected_p = true;
|
|
1568 }
|
|
1569
|
|
/* Decide whether the cost model would allow us to version LOOP,
   either directly or as part of a parent loop, and return true if so.
   This does not imply that the loop is actually worth versioning in its
   own right, just that it would be valid to version it if something
   benefited.

   We have already made this decision for all inner loops of LOOP.  */

bool
loop_versioning::decide_whether_loop_is_versionable (class loop *loop)
{
  loop_info &li = get_loop_info (loop);

  if (li.rejected_p)
    return false;

  /* Examine the decisions made for inner loops.  */
  for (class loop *inner = loop->inner; inner; inner = inner->next)
    {
      loop_info &inner_li = get_loop_info (inner);
      if (inner_li.rejected_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "not versioning this loop because one of its"
			     " inner loops should not be versioned\n");
	  return false;
	}

      if (inner_li.worth_versioning_p ())
	li.subloops_benefit_p = true;

      /* Accumulate the number of instructions from subloops that are not
	 the innermost, or that don't benefit from versioning.  Only the
	 instructions from innermost loops that benefit from versioning
	 should be weighed against loop-versioning-max-inner-insns;
	 everything else should be weighed against
	 loop-versioning-max-outer-insns.  */
      if (!inner_li.worth_versioning_p () || inner->inner)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "counting %d instructions from this loop"
			     " against its parent loop\n", inner_li.num_insns);
	  li.num_insns += inner_li.num_insns;
	}
    }

  /* Enforce the size limits.  Only loops that themselves benefit from
     versioning need to pass them; other loops merely stay versionable
     so that checks can be hoisted through them.  */
  if (li.worth_versioning_p ())
    {
      unsigned int max_num_insns = max_insns_for_loop (loop);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			 "this loop has %d instructions, against"
			 " a versioning limit of %d\n",
			 li.num_insns, max_num_insns);
      if (li.num_insns > max_num_insns)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION
			     | MSG_PRIORITY_USER_FACING,
			     find_loop_location (loop),
			     "this loop is too big to version");
	  return false;
	}
    }

  /* Hoist all version checks from subloops to this loop.  */
  for (class loop *subloop = loop->inner; subloop; subloop = subloop->next)
    merge_loop_info (loop, subloop);

  return true;
}
|
|
1644
|
|
/* Decide which loops to version and add them to the versioning queue.
   Return true if there are any loops to version.  */

bool
loop_versioning::make_versioning_decisions ()
{
  AUTO_DUMP_SCOPE ("make_versioning_decisions",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  class loop *loop;
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop_info &linfo = get_loop_info (loop);
      if (decide_whether_loop_is_versionable (loop))
	{
	  /* Commit to versioning LOOP directly if we can't hoist the
	     version checks any further.  */
	  if (linfo.worth_versioning_p ()
	      && (loop_depth (loop) == 1 || linfo.outermost == loop))
	    add_loop_to_queue (loop);
	}
      else
	{
	  /* We can't version this loop, so individually version any
	     subloops that would benefit and haven't been versioned yet.
	     (add_loop_to_queue sets rejected_p on queued loops, so
	     worth_versioning_p is false for already-queued subloops.)  */
	  linfo.rejected_p = true;
	  for (class loop *subloop = loop->inner; subloop;
	       subloop = subloop->next)
	    if (get_loop_info (subloop).worth_versioning_p ())
	      add_loop_to_queue (subloop);
	}
    }

  return !m_loops_to_version.is_empty ();
}
|
|
1680
|
|
/* Attempt to implement loop versioning for LOOP, using the information
   cached in the associated loop_info.  Return true on success.  */

bool
loop_versioning::version_loop (class loop *loop)
{
  loop_info &li = get_loop_info (loop);

  /* Build up a condition that selects the original loop instead of
     the simplified loop: true iff any recorded stride is not 1.  */
  tree cond = boolean_false_node;
  bitmap_iterator bi;
  unsigned int i;
  EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
    {
      tree name = ssa_name (i);
      tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name,
				 build_one_cst (TREE_TYPE (name)));
      cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one);
    }

  /* Convert the condition into a suitable gcond.  Any statements needed
     to compute it are collected in STMTS and inserted below, once we
     know which block the condition ends up in.  */
  gimple_seq stmts = NULL;
  cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr, NULL_TREE);

  /* Version the loop.  */
  initialize_original_copy_tables ();
  basic_block cond_bb;
  li.optimized_loop = loop_version (loop, cond, &cond_bb,
				    profile_probability::unlikely (),
				    profile_probability::likely (),
				    profile_probability::unlikely (),
				    profile_probability::likely (), true);
  free_original_copy_tables ();
  if (!li.optimized_loop)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop),
			 "tried but failed to version this loop for when"
			 " certain strides are 1\n");
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
		     "versioned this loop for when certain strides are 1\n");

  /* Insert the statements that feed COND, just before the new gcond at
     the end of COND_BB.  */
  if (stmts)
    {
      gimple_stmt_iterator gsi = gsi_last_bb (cond_bb);
      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
    }

  return true;
}
|
|
1737
|
|
1738 /* Attempt to version all loops in the versioning queue. */
|
|
1739
|
|
1740 void
|
|
1741 loop_versioning::implement_versioning_decisions ()
|
|
1742 {
|
|
1743 /* No AUTO_DUMP_SCOPE here since all messages are top-level and
|
|
1744 user-facing at this point. */
|
|
1745
|
|
1746 bool any_succeeded_p = false;
|
|
1747 class loop *loop;
|
|
1748 unsigned int i;
|
|
1749 FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
|
|
1750 if (version_loop (loop))
|
|
1751 any_succeeded_p = true;
|
|
1752 if (!any_succeeded_p)
|
|
1753 return;
|
|
1754
|
|
1755 update_ssa (TODO_update_ssa);
|
|
1756
|
|
1757 /* Simplify the new loop, which is used when COND is false. */
|
|
1758 FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
|
|
1759 {
|
|
1760 loop_info &linfo = get_loop_info (loop);
|
|
1761 if (linfo.optimized_loop)
|
|
1762 name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header);
|
|
1763 }
|
|
1764 }
|
|
1765
|
|
1766 /* Run the pass and return a set of TODO_* flags. */
|
|
1767
|
|
1768 unsigned int
|
|
1769 loop_versioning::run ()
|
|
1770 {
|
|
1771 gcc_assert (scev_initialized_p ());
|
|
1772
|
|
1773 if (analyze_blocks ()
|
|
1774 && prune_conditions ()
|
|
1775 && make_versioning_decisions ())
|
|
1776 implement_versioning_decisions ();
|
|
1777
|
|
1778 return 0;
|
|
1779 }
|
|
1780
|
|
/* Loop versioning pass.  Static description of the pass: a GIMPLE pass
   named "lversion" in the loop optinfo group, timed under
   TV_LOOP_VERSIONING.  It requires a CFG and sets no TODO flags of
   its own.  */

const pass_data pass_data_loop_versioning =
{
  GIMPLE_PASS, /* type */
  "lversion", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_VERSIONING, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
|
|
1795
|
|
/* Pass wrapper around the loop_versioning machinery above.  */

class pass_loop_versioning : public gimple_opt_pass
{
public:
  pass_loop_versioning (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_versioning, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only when -fversion-loops-for-strides is enabled.  */
  virtual bool gate (function *) { return flag_version_loops_for_strides; }
  virtual unsigned int execute (function *);
};
|
|
1807
|
|
1808 unsigned int
|
|
1809 pass_loop_versioning::execute (function *fn)
|
|
1810 {
|
|
1811 if (number_of_loops (fn) <= 1)
|
|
1812 return 0;
|
|
1813
|
|
1814 return loop_versioning (fn).run ();
|
|
1815 }
|
|
1816
|
|
1817 } // anon namespace
|
|
1818
|
|
1819 gimple_opt_pass *
|
|
1820 make_pass_loop_versioning (gcc::context *ctxt)
|
|
1821 {
|
|
1822 return new pass_loop_versioning (ctxt);
|
|
1823 }
|