/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software
   Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.  This first implementation focuses on
   simple inner-most loops, with no conditional control flow, and a set of
   simple operations whose vector form can be expressed using existing
   tree codes (PLUS, MULT etc).

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it had been manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.

   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop_nest().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;       STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "UNITS_PER_SIMD_WORD".  Targets that can
   support several vector sizes will, for now, need to specify one value
   for "UNITS_PER_SIMD_WORD".  More flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "toplev.h"
#include "tree-chrec.h"
#include "tree-data-ref.h"
#include "tree-scalar-evolution.h"
#include "input.h"
#include "hashtab.h"
#include "tree-vectorizer.h"
#include "tree-pass.h"
#include "langhooks.h"

/*************************************************************************
  General Vectorization Utilities
 *************************************************************************/

/* vect_dump will be set to stderr, or to dump_file if it exists.  */
FILE *vect_dump;

/* vect_verbosity_level is set to an invalid value
   to mark that it's uninitialized.  */
enum verbosity_levels vect_verbosity_level = MAX_VERBOSITY_LEVEL;

/* Loop location.  */
static LOC vect_loop_location;

/* Bitmap of virtual variables to be renamed.  */
bitmap vect_memsyms_to_rename;

/* Vector mapping GIMPLE stmt to stmt_vec_info.  */
VEC(vec_void_p,heap) *stmt_vec_info_vec;


/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/


/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
  tree new_name;

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

  new_name = get_current_def (USE_FROM_PTR (op_p));

  /* Something defined outside of the loop.  */
  if (!new_name)
    return;

  /* An ordinary ssa name defined in the loop.  */

  SET_USE (op_p, new_name);
}


/* Renames the variables in basic block BB.  */

void
rename_variables_in_bb (basic_block bb)
{
  gimple_stmt_iterator gsi;
  gimple stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  struct loop *loop = bb->loop_father;

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
        rename_use_op (use_p);
    }

  FOR_EACH_EDGE (e, ei, bb->succs)
    {
      if (!flow_bb_inside_loop_p (loop, e->dest))
        continue;
      for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi_stmt (gsi), e));
    }
}


/* Renames the variables in the newly generated LOOP.  */

void
rename_variables_in_loop (struct loop *loop)
{
  unsigned i;
  basic_block *bbs;

  bbs = get_loop_body (loop);

  for (i = 0; i < loop->num_nodes; i++)
    rename_variables_in_bb (bbs[i]);

  free (bbs);
}


/* Update the PHI nodes of NEW_LOOP.

   NEW_LOOP is a duplicate of ORIG_LOOP.
   AFTER indicates whether NEW_LOOP executes before or after ORIG_LOOP:
   AFTER is true if NEW_LOOP executes after ORIG_LOOP, and false if it
   executes before it.  */

static void
slpeel_update_phis_for_duplicate_loop (struct loop *orig_loop,
                                       struct loop *new_loop, bool after)
{
  tree new_ssa_name;
  gimple phi_new, phi_orig;
  tree def;
  edge orig_loop_latch = loop_latch_edge (orig_loop);
  edge orig_entry_e = loop_preheader_edge (orig_loop);
  edge new_loop_exit_e = single_exit (new_loop);
  edge new_loop_entry_e = loop_preheader_edge (new_loop);
  edge entry_arg_e = (after ? orig_loop_latch : orig_entry_e);
  gimple_stmt_iterator gsi_new, gsi_orig;

  /*
     step 1. For each loop-header-phi:
             Add the first phi argument for the phi in NEW_LOOP
             (the one associated with the entry of NEW_LOOP)

     step 2. For each loop-header-phi:
             Add the second phi argument for the phi in NEW_LOOP
             (the one associated with the latch of NEW_LOOP)

     step 3. Update the phis in the successor block of NEW_LOOP.

        case 1: NEW_LOOP was placed before ORIG_LOOP:
                The successor block of NEW_LOOP is the header of ORIG_LOOP.
                Updating the phis in the successor block can therefore be done
                along with the scanning of the loop header phis, because the
                header blocks of ORIG_LOOP and NEW_LOOP have exactly the same
                phi nodes, organized in the same order.

        case 2: NEW_LOOP was placed after ORIG_LOOP:
                The successor block of NEW_LOOP is the original exit block of
                ORIG_LOOP - the phis to be updated are the loop-closed-ssa phis.
                We postpone updating these phis to a later stage (when
                loop guards are added).
   */


  /* Scan the phis in the headers of the old and new loops
     (they are organized in exactly the same order).  */

  for (gsi_new = gsi_start_phis (new_loop->header),
       gsi_orig = gsi_start_phis (orig_loop->header);
       !gsi_end_p (gsi_new) && !gsi_end_p (gsi_orig);
       gsi_next (&gsi_new), gsi_next (&gsi_orig))
    {
      phi_new = gsi_stmt (gsi_new);
      phi_orig = gsi_stmt (gsi_orig);

      /* step 1.  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, entry_arg_e);
      add_phi_arg (phi_new, def, new_loop_entry_e);

      /* step 2.  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
      if (TREE_CODE (def) != SSA_NAME)
        continue;

      new_ssa_name = get_current_def (def);
      if (!new_ssa_name)
        {
          /* This only happens if there are no definitions
             inside the loop.  Use the phi_result in this case.  */
          new_ssa_name = PHI_RESULT (phi_new);
        }

      /* An ordinary ssa name defined in the loop.  */
      add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));

      /* step 3 (case 1).  */
      if (!after)
        {
          gcc_assert (new_loop_exit_e == orig_entry_e);
          SET_PHI_ARG_DEF (phi_orig,
                           new_loop_exit_e->dest_idx,
                           new_ssa_name);
        }
    }
}


/* Update PHI nodes for a guard of the LOOP.

   Input:
   - LOOP, GUARD_EDGE: LOOP is a loop for which we added guard code that
        controls whether LOOP is to be executed.  GUARD_EDGE is the edge that
        originates from the guard-bb, skips LOOP and reaches the (unique) exit
        bb of LOOP.  This loop-exit-bb is an empty bb with one successor.
        We denote this bb NEW_MERGE_BB because before the guard code was added
        it had a single predecessor (the LOOP header), and now it became a merge
        point of two paths - the path that ends with the LOOP exit-edge, and
        the path that ends with GUARD_EDGE.
   - NEW_EXIT_BB: New basic block that is added by this function between LOOP
        and NEW_MERGE_BB.  It is used to place loop-closed-ssa-form exit-phis.

   ===> The CFG before the guard-code was added:
        LOOP_header_bb:
          loop_body
          if (exit_loop) goto update_bb
          else           goto LOOP_header_bb
        update_bb:

   ==> The CFG after the guard-code was added:
        guard_bb:
          if (LOOP_guard_condition) goto new_merge_bb
          else                      goto LOOP_header_bb
        LOOP_header_bb:
          loop_body
          if (exit_loop_condition) goto new_merge_bb
          else                     goto LOOP_header_bb
        new_merge_bb:
          goto update_bb
        update_bb:

   ==> The CFG after this function:
        guard_bb:
          if (LOOP_guard_condition) goto new_merge_bb
          else                      goto LOOP_header_bb
        LOOP_header_bb:
          loop_body
          if (exit_loop_condition) goto new_exit_bb
          else                     goto LOOP_header_bb
        new_exit_bb:
        new_merge_bb:
          goto update_bb
        update_bb:

   This function:
   1. creates and updates the relevant phi nodes to account for the new
      incoming edge (GUARD_EDGE) into NEW_MERGE_BB.  This involves:
      1.1. Create phi nodes at NEW_MERGE_BB.
      1.2. Update the phi nodes at the successor of NEW_MERGE_BB (denoted
           UPDATE_BB).  UPDATE_BB was the exit-bb of LOOP before NEW_MERGE_BB
           was added.
   2. preserves loop-closed-ssa-form by creating the required phi nodes
      at the exit of LOOP (i.e, in NEW_EXIT_BB).

   There are two flavors to this function:

   slpeel_update_phi_nodes_for_guard1:
     Here the guard controls whether we enter or skip LOOP, where LOOP is a
     prolog_loop (loop1 below), and the new phis created in NEW_MERGE_BB are
     for variables that have phis in the loop header.

   slpeel_update_phi_nodes_for_guard2:
     Here the guard controls whether we enter or skip LOOP, where LOOP is an
     epilog_loop (loop2 below), and the new phis created in NEW_MERGE_BB are
     for variables that have phis in the loop exit.

   I.E., the overall structure is:

        loop1_preheader_bb:
                guard1 (goto loop1/merge1_bb)
        loop1
        loop1_exit_bb:
                guard2 (goto merge1_bb/merge2_bb)
        merge1_bb
        loop2
        loop2_exit_bb
        merge2_bb
        next_bb

   slpeel_update_phi_nodes_for_guard1 takes care of creating phis in
   loop1_exit_bb and merge1_bb.  These are entry phis (phis for the vars
   that have phis in loop1->header).

   slpeel_update_phi_nodes_for_guard2 takes care of creating phis in
   loop2_exit_bb and merge2_bb.  These are exit phis (phis for the vars
   that have phis in next_bb).  It also adds some of these phis to
   loop1_exit_bb.

   slpeel_update_phi_nodes_for_guard1 is always called before
   slpeel_update_phi_nodes_for_guard2.  They are both needed in order
   to create correct data-flow and loop-closed-ssa-form.

   Generally slpeel_update_phi_nodes_for_guard1 creates phis for variables
   that change between iterations of a loop (and therefore have a phi-node
   at the loop entry), whereas slpeel_update_phi_nodes_for_guard2 creates
   phis for variables that are used out of the loop (and therefore have
   loop-closed exit phis).  Some variables may be both updated between
   iterations and used after the loop.  This is why in loop1_exit_bb we
   may need both entry_phis (created by slpeel_update_phi_nodes_for_guard1)
   and exit phis (created by slpeel_update_phi_nodes_for_guard2).

   - IS_NEW_LOOP: if IS_NEW_LOOP is true, then LOOP is a newly created copy of
     an original loop.  i.e., we have:

           orig_loop
           guard_bb (goto LOOP/new_merge)
           new_loop <-- LOOP
           new_exit
           new_merge
           next_bb

     If IS_NEW_LOOP is false, then LOOP is an original loop, in which case we
     have:

           new_loop
           guard_bb (goto LOOP/new_merge)
           orig_loop <-- LOOP
           new_exit
           new_merge
           next_bb

     The SSA names defined in the original loop have a current
     reaching definition that records the corresponding new
     ssa-name used in the new duplicated loop copy.
  */


/* Function slpeel_update_phi_nodes_for_guard1

   Input:
   - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
   - DEFS - a bitmap of ssa names to mark new names for which we recorded
        information.

   In the context of the overall structure, we have:

        loop1_preheader_bb:
                guard1 (goto loop1/merge1_bb)
LOOP->  loop1
        loop1_exit_bb:
                guard2 (goto merge1_bb/merge2_bb)
        merge1_bb
        loop2
        loop2_exit_bb
        merge2_bb
        next_bb

   For each name updated between loop iterations (i.e - for each name that has
   an entry (loop-header) phi in LOOP) we create a new phi in:
   1. merge1_bb (to account for the edge from guard1)
   2. loop1_exit_bb (an exit-phi to keep LOOP in loop-closed form)
*/

static void
slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
                                    bool is_new_loop, basic_block *new_exit_bb,
                                    bitmap *defs)
{
  gimple orig_phi, new_phi;
  gimple update_phi, update_phi2;
  tree guard_arg, loop_arg;
  basic_block new_merge_bb = guard_edge->dest;
  edge e = EDGE_SUCC (new_merge_bb, 0);
  basic_block update_bb = e->dest;
  basic_block orig_bb = loop->header;
  edge new_exit_e;
  tree current_new_name;
  tree name;
  gimple_stmt_iterator gsi_orig, gsi_update;

  /* Create new bb between loop and new_merge_bb.  */
  *new_exit_bb = split_edge (single_exit (loop));

  new_exit_e = EDGE_SUCC (*new_exit_bb, 0);

  for (gsi_orig = gsi_start_phis (orig_bb),
       gsi_update = gsi_start_phis (update_bb);
       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
       gsi_next (&gsi_orig), gsi_next (&gsi_update))
    {
      orig_phi = gsi_stmt (gsi_orig);
      update_phi = gsi_stmt (gsi_update);

      /* Virtual phi; mark it for renaming.  We actually want to call
         mark_sym_for_renaming, but since all ssa renaming datastructures
         are going to be freed before we get to call ssa_update, we just
         record this name for now in a bitmap, and will mark it for
         renaming later.  */
      name = PHI_RESULT (orig_phi);
      if (!is_gimple_reg (SSA_NAME_VAR (name)))
        bitmap_set_bit (vect_memsyms_to_rename, DECL_UID (SSA_NAME_VAR (name)));

      /** 1. Handle new-merge-point phis  **/

      /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_merge_bb);

      /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
            of LOOP.  Set the two phi args in NEW_PHI for these edges:  */
      loop_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, EDGE_SUCC (loop->latch, 0));
      guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, loop_preheader_edge (loop));

      add_phi_arg (new_phi, loop_arg, new_exit_e);
      add_phi_arg (new_phi, guard_arg, guard_edge);

      /* 1.3. Update phi in successor block.  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == loop_arg
                  || PHI_ARG_DEF_FROM_EDGE (update_phi, e) == guard_arg);
      SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
      update_phi2 = new_phi;


      /** 2. Handle loop-closed-ssa-form phis  **/

      if (!is_gimple_reg (PHI_RESULT (orig_phi)))
        continue;

      /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 *new_exit_bb);

      /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
      add_phi_arg (new_phi, loop_arg, single_exit (loop));

      /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
      SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));

      /* 2.4. Record the newly created name with set_current_def.
              We want to find a name such that
                 name = get_current_def (orig_loop_name)
              and to set its current definition as follows:
                 set_current_def (name, new_phi_name)

              If LOOP is a new loop then loop_arg is already the name we're
              looking for.  If LOOP is the original loop, then loop_arg is
              the orig_loop_name and the relevant name is recorded in its
              current reaching definition.  */
      if (is_new_loop)
        current_new_name = loop_arg;
      else
        {
          current_new_name = get_current_def (loop_arg);
          /* current_def is not available only if the variable does not
             change inside the loop, in which case we also don't care
             about recording a current_def for it because we won't be
             trying to create loop-exit-phis for it.  */
          if (!current_new_name)
            continue;
        }
      gcc_assert (get_current_def (current_new_name) == NULL_TREE);

      set_current_def (current_new_name, PHI_RESULT (new_phi));
      bitmap_set_bit (*defs, SSA_NAME_VERSION (current_new_name));
    }
}


/* Function slpeel_update_phi_nodes_for_guard2

   Input:
   - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.

   In the context of the overall structure, we have:

        loop1_preheader_bb:
                guard1 (goto loop1/merge1_bb)
        loop1
        loop1_exit_bb:
                guard2 (goto merge1_bb/merge2_bb)
        merge1_bb
LOOP->  loop2
        loop2_exit_bb
        merge2_bb
        next_bb

   For each name used outside the loop (i.e - for each name that has an exit
   phi in next_bb) we create a new phi in:
   1. merge2_bb (to account for the edge from guard_bb)
   2. loop2_exit_bb (an exit-phi to keep LOOP in loop-closed form)
   3. guard2 bb (an exit phi to keep the preceding loop in loop-closed form),
      if needed (if it wasn't handled by slpeel_update_phi_nodes_for_guard1).
*/

static void
slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop,
                                    bool is_new_loop, basic_block *new_exit_bb)
{
  gimple orig_phi, new_phi;
  gimple update_phi, update_phi2;
  tree guard_arg, loop_arg;
  basic_block new_merge_bb = guard_edge->dest;
  edge e = EDGE_SUCC (new_merge_bb, 0);
  basic_block update_bb = e->dest;
  edge new_exit_e;
  tree orig_def, orig_def_new_name;
  tree new_name, new_name2;
  tree arg;
  gimple_stmt_iterator gsi;

  /* Create new bb between loop and new_merge_bb.  */
  *new_exit_bb = split_edge (single_exit (loop));

  new_exit_e = EDGE_SUCC (*new_exit_bb, 0);

  for (gsi = gsi_start_phis (update_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      update_phi = gsi_stmt (gsi);
      orig_phi = update_phi;
      orig_def = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
      /* This loop-closed-phi actually doesn't represent a use
         out of the loop - the phi arg is a constant.  */
      if (TREE_CODE (orig_def) != SSA_NAME)
        continue;
      orig_def_new_name = get_current_def (orig_def);
      arg = NULL_TREE;

      /** 1. Handle new-merge-point phis  **/

      /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_merge_bb);

      /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
            of LOOP.  Set the two PHI args in NEW_PHI for these edges:  */
      new_name = orig_def;
      new_name2 = NULL_TREE;
      if (orig_def_new_name)
        {
          new_name = orig_def_new_name;
          /* Some variables have both loop-entry-phis and loop-exit-phis.
             Such variables were given yet newer names by phis placed in
             guard_bb by slpeel_update_phi_nodes_for_guard1.  I.e:
             new_name2 = get_current_def (get_current_def (orig_name)).  */
          new_name2 = get_current_def (new_name);
        }

      if (is_new_loop)
        {
          guard_arg = orig_def;
          loop_arg = new_name;
        }
      else
        {
          guard_arg = new_name;
          loop_arg = orig_def;
        }
      if (new_name2)
        guard_arg = new_name2;

      add_phi_arg (new_phi, loop_arg, new_exit_e);
      add_phi_arg (new_phi, guard_arg, guard_edge);

      /* 1.3. Update phi in successor block.  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == orig_def);
      SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
      update_phi2 = new_phi;


      /** 2. Handle loop-closed-ssa-form phis  **/

      /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 *new_exit_bb);

      /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
      add_phi_arg (new_phi, loop_arg, single_exit (loop));

      /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
      SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));


      /** 3. Handle loop-closed-ssa-form phis for first loop  **/

      /* 3.1. Find the relevant names that need an exit-phi in
         GUARD_BB, i.e. names for which
         slpeel_update_phi_nodes_for_guard1 had not already created a
         phi node.  This is the case for names that are used outside
         the loop (and therefore need an exit phi) but are not updated
         across loop iterations (and therefore don't have a
         loop-header-phi).

         slpeel_update_phi_nodes_for_guard1 is responsible for
         creating loop-exit phis in GUARD_BB for names that have a
         loop-header-phi.  When such a phi is created we also record
         the new name in its current definition.  If this new name
         exists, then guard_arg was set to this new name (see 1.2
         above).  Therefore, if guard_arg is not this new name, this
         is an indication that an exit-phi in GUARD_BB was not yet
         created, so we take care of it here.  */
      if (guard_arg == new_name2)
        continue;
      arg = guard_arg;

      /* 3.2. Generate new phi node in GUARD_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 guard_edge->src);

      /* 3.3. GUARD_BB has one incoming edge:  */
      gcc_assert (EDGE_COUNT (guard_edge->src->preds) == 1);
      add_phi_arg (new_phi, arg, EDGE_PRED (guard_edge->src, 0));

      /* 3.4. Update phi in successor of GUARD_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, guard_edge)
                  == guard_arg);
      SET_PHI_ARG_DEF (update_phi2, guard_edge->dest_idx, PHI_RESULT (new_phi));
    }
}


/* Make the LOOP iterate NITERS times.  This is done by adding a new IV
   that starts at zero, increases by one and its limit is NITERS.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */

void
slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
{
  tree indx_before_incr, indx_after_incr;
  gimple cond_stmt;
  gimple orig_cond;
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator loop_cond_gsi;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree init = build_int_cst (TREE_TYPE (niters), 0);
  tree step = build_int_cst (TREE_TYPE (niters), 1);
  LOC loop_loc;
  enum tree_code code;

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);

  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
                                              true, NULL_TREE, true,
                                              GSI_SAME_STMT);
  niters = force_gimple_operand_gsi (&loop_cond_gsi, niters, true, NULL_TREE,
                                     true, GSI_SAME_STMT);

  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
  cond_stmt = gimple_build_cond (code, indx_after_incr, niters, NULL_TREE,
                                 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* Remove old loop exit test:  */
  gsi_remove (&loop_cond_gsi, true);

  loop_loc = find_loop_location (loop);
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (loop_loc != UNKNOWN_LOC)
        fprintf (dump_file, "\nloop at %s:%d: ",
                 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
      print_gimple_stmt (dump_file, cond_stmt, 0, TDF_SLIM);
    }

  loop->nb_iterations = niters;
}

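
/* Illustrative sketch, not part of the original sources: after the function
   above runs, the loop exit test has the canonical form built from the newly
   created IV.  In GIMPLE-like pseudo code (all names made up):

       i_after = i_before + 1;
       if (i_after >= niters)     <-- or '<' when the exit edge is the FALSE edge
         goto exit_bb;
       else
         goto loop_header_bb;

   The original exit condition is removed, so the loop now executes exactly
   NITERS iterations.  */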

/* Given LOOP this function generates a new copy of it and puts it
   on E which is either the entry or exit of LOOP.  */

struct loop *
slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
{
  struct loop *new_loop;
  basic_block *new_bbs, *bbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  gimple phi;
  tree phi_arg;
  edge exit, new_exit;
  gimple_stmt_iterator gsi;

  at_exit = (e == single_exit (loop));
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  bbs = get_loop_body (loop);

  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (bbs, loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (loop, loop_outer (loop));
  if (!new_loop)
    {
      free (bbs);
      return NULL;
    }

  exit_dest = single_exit (loop)->dest;
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
                                          exit_dest) == loop->header ?
                 true : false);

  new_bbs = XNEWVEC (basic_block, loop->num_nodes);

  exit = single_exit (loop);
  copy_bbs (bbs, loop->num_nodes, new_bbs,
            &exit, 1, &new_exit, NULL,
            e->src);

  /* Duplicating phi args at exit bbs as coming
     also from exit of duplicated loop.  */
  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      phi = gsi_stmt (gsi);
      phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop));
      if (phi_arg)
        {
          edge new_loop_exit_edge;

          if (EDGE_SUCC (new_loop->header, 0)->dest == new_loop->latch)
            new_loop_exit_edge = EDGE_SUCC (new_loop->header, 1);
          else
            new_loop_exit_edge = EDGE_SUCC (new_loop->header, 0);

          add_phi_arg (phi, phi_arg, new_loop_exit_edge);
        }
    }

  if (at_exit) /* Add the loop copy at exit.  */
    {
      redirect_edge_and_branch_force (e, new_loop->header);
      PENDING_STMT (e) = NULL;
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
      if (was_imm_dom)
        set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
    }
  else /* Add the copy at entry.  */
    {
      edge new_exit_e;
      edge entry_e = loop_preheader_edge (loop);
      basic_block preheader = entry_e->src;

      if (!flow_bb_inside_loop_p (new_loop,
                                  EDGE_SUCC (new_loop->header, 0)->dest))
        new_exit_e = EDGE_SUCC (new_loop->header, 0);
      else
        new_exit_e = EDGE_SUCC (new_loop->header, 1);

      redirect_edge_and_branch_force (new_exit_e, loop->header);
      PENDING_STMT (new_exit_e) = NULL;
      set_immediate_dominator (CDI_DOMINATORS, loop->header,
                               new_exit_e->src);

      /* We have to add phi args to the loop->header here as coming
         from new_exit_e edge.  */
      for (gsi = gsi_start_phis (loop->header);
           !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          phi = gsi_stmt (gsi);
          phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, entry_e);
          if (phi_arg)
            add_phi_arg (phi, phi_arg, new_exit_e);
        }

      redirect_edge_and_branch_force (entry_e, new_loop->header);
      PENDING_STMT (entry_e) = NULL;
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
    }

  free (new_bbs);
  free (bbs);

  return new_loop;
}


/* Given the condition statement COND, put it as the last statement
   of GUARD_BB; EXIT_BB is the basic block to skip the loop;
   Assumes that this is the single exit of the guarded loop.
   Returns the skip edge.  */

static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
                       basic_block dom_bb)
{
  gimple_stmt_iterator gsi;
  edge new_e, enter_e;
  gimple cond_stmt;
  gimple_seq gimplify_stmt_list = NULL;

  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  gsi = gsi_last_bb (guard_bb);

  cond = force_gimple_operand (cond, &gimplify_stmt_list, true, NULL_TREE);
  cond_stmt = gimple_build_cond (NE_EXPR,
                                 cond, build_int_cst (TREE_TYPE (cond), 0),
                                 NULL_TREE, NULL_TREE);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  gsi = gsi_last_bb (guard_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
  return new_e;
}

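
/* Illustrative sketch, not part of the original sources: the guard block that
   the function above produces looks roughly like this (pseudo GIMPLE, names
   made up):

       guard_bb:
         <stmts produced by gimplifying COND, if any>
         if (cond != 0) goto exit_bb;        <-- new TRUE edge, skips the loop
         else goto loop_preheader_bb;        <-- old fallthru edge, now FALSE

   The returned edge is the new TRUE edge from GUARD_BB to EXIT_BB.  */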


/* This function verifies that the following restrictions apply to LOOP:
   (1) it is innermost
   (2) it consists of exactly 2 basic blocks - header, and an empty latch.
   (3) it is single entry, single exit
   (4) its exit condition is the last stmt in the header
   (5) E is the entry/exit edge of LOOP.
 */

bool
slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
{
  edge exit_e = single_exit (loop);
  edge entry_e = loop_preheader_edge (loop);
  gimple orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);

  if (need_ssa_update_p ())
    return false;

  if (loop->inner
      /* All loops have an outer scope; the only case loop->outer is NULL is for
         the function itself.  */
      || !loop_outer (loop)
      || loop->num_nodes != 2
      || !empty_block_p (loop->latch)
      || !single_exit (loop)
      /* Verify that new loop exit condition can be trivially modified.  */
      || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
      || (e != exit_e && e != entry_e))
    return false;

  return true;
}

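
/* Illustrative sketch, not part of the original sources: the only loop shape
   the predicate above accepts is the canonical two-block do-while form, e.g.
   (pseudo code, names made up):

       header_bb:               <-- loop body and exit test both live here
         loop_body
         if (exit_cond) goto exit_bb; else goto latch_bb;
       latch_bb:                <-- empty, branches back to header_bb

   Nested loops, extra blocks, non-empty latches, or multiple exits are all
   rejected.  */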

#ifdef ENABLE_CHECKING
void
slpeel_verify_cfg_after_peeling (struct loop *first_loop,
                                 struct loop *second_loop)
{
  basic_block loop1_exit_bb = single_exit (first_loop)->dest;
  basic_block loop2_entry_bb = loop_preheader_edge (second_loop)->src;
  basic_block loop1_entry_bb = loop_preheader_edge (first_loop)->src;

  /* A guard that controls whether the second_loop is to be executed or skipped
     is placed in first_loop->exit.  first_loop->exit therefore has two
     successors - one is the preheader of second_loop, and the other is a bb
     after second_loop.  */
  gcc_assert (EDGE_COUNT (loop1_exit_bb->succs) == 2);

  /* 1. Verify that one of the successors of first_loop->exit is the preheader
        of second_loop.  */

  /* The preheader of new_loop is expected to have two predecessors:
     first_loop->exit and the block that precedes first_loop.  */

  gcc_assert (EDGE_COUNT (loop2_entry_bb->preds) == 2
              && ((EDGE_PRED (loop2_entry_bb, 0)->src == loop1_exit_bb
                   && EDGE_PRED (loop2_entry_bb, 1)->src == loop1_entry_bb)
                  || (EDGE_PRED (loop2_entry_bb, 1)->src == loop1_exit_bb
                      && EDGE_PRED (loop2_entry_bb, 0)->src == loop1_entry_bb)));

  /* Verify that the other successor of first_loop->exit is after the
     second_loop.  */
  /* TODO */
}
#endif


/* If the run time cost model check determines that vectorization is
   not profitable and hence a scalar loop should be generated, then set
   FIRST_NITERS to the number of prologue peeled iterations.  This will
   allow all the iterations to be executed in the prologue peeled scalar
   loop.  */

void
set_prologue_iterations (basic_block bb_before_first_loop,
                         tree first_niters,
                         struct loop *loop,
                         unsigned int th)
{
  edge e;
  basic_block cond_bb, then_bb;
  tree var, prologue_after_cost_adjust_name;
  gimple_stmt_iterator gsi;
  gimple newphi;
  edge e_true, e_false, e_fallthru;
  gimple cond_stmt;
  gimple_seq gimplify_stmt_list = NULL, stmts = NULL;
  tree cost_pre_condition = NULL_TREE;
  tree scalar_loop_iters =
    unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));

  e = single_pred_edge (bb_before_first_loop);
  cond_bb = split_edge (e);

  e = single_pred_edge (bb_before_first_loop);
  then_bb = split_edge (e);
  set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);

  e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
                                   EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);

  e_true = EDGE_PRED (then_bb, 0);
  e_true->flags &= ~EDGE_FALLTHRU;
  e_true->flags |= EDGE_TRUE_VALUE;

  e_fallthru = EDGE_SUCC (then_bb, 0);

  cost_pre_condition =
    fold_build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
                 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
  cost_pre_condition =
    force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
                          true, NULL_TREE);
  cond_stmt = gimple_build_cond (NE_EXPR, cost_pre_condition,
                                 build_int_cst (TREE_TYPE (cost_pre_condition),
                                                0), NULL_TREE, NULL_TREE);

  gsi = gsi_last_bb (cond_bb);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  gsi = gsi_last_bb (cond_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
                        "prologue_after_cost_adjust");
  add_referenced_var (var);
  prologue_after_cost_adjust_name =
    force_gimple_operand (scalar_loop_iters, &stmts, false, var);

  gsi = gsi_last_bb (then_bb);
  if (stmts)
    gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);

  newphi = create_phi_node (var, bb_before_first_loop);
  add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
  add_phi_arg (newphi, first_niters, e_false);

  first_niters = PHI_RESULT (newphi);
}

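
/* Illustrative sketch, not part of the original sources: the CFG built by the
   function above implements, in effect (pseudo code, names made up):

       if (scalar_loop_iters <= th)                     <-- cond_bb
         adjusted = scalar_loop_iters;                  <-- then_bb
       first_niters = PHI <adjusted, first_niters>;     <-- bb_before_first_loop

   so when the runtime cost check says vectorization does not pay off, the
   prologue scalar loop simply absorbs every iteration.  */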


/* Function slpeel_tree_peel_loop_to_edge.

   Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
   that is placed on the entry (exit) edge E of LOOP.  After this transformation
   we have two loops one after the other - first-loop iterates FIRST_NITERS
   times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
   If the cost model indicates that it is profitable to emit a scalar
   loop instead of the vector one, then the prolog (epilog) loop will iterate
   for the entire unchanged scalar iterations of the loop.

   Input:
   - LOOP: the loop to be peeled.
   - E: the exit or entry edge of LOOP.
        If it is the entry edge, we peel the first iterations of LOOP.  In this
        case first-loop is LOOP, and second-loop is the newly created loop.
        If it is the exit edge, we peel the last iterations of LOOP.  In this
        case, first-loop is the newly created loop, and second-loop is LOOP.
   - NITERS: the number of iterations that LOOP iterates.
   - FIRST_NITERS: the number of iterations that the first-loop should iterate.
   - UPDATE_FIRST_LOOP_COUNT: specifies whether this function is responsible
        for updating the loop bound of the first-loop to FIRST_NITERS.  If it
        is false, the caller of this function may want to take care of this
        (this can be useful if we don't want new stmts added to first-loop).
   - TH: cost model profitability threshold of iterations for vectorization.
   - CHECK_PROFITABILITY: specify whether cost model check has not occurred
                          during versioning and hence needs to occur during
                          prologue generation or whether cost model check
                          has not occurred during prologue generation and hence
                          needs to occur during epilogue generation.


   Output:
   The function returns a pointer to the new loop-copy, or NULL if it failed
   to perform the transformation.

   The function generates two if-then-else guards: one before the first loop,
   and the other before the second loop:
   The first guard is:
     if (FIRST_NITERS == 0) then skip the first loop,
     and go directly to the second loop.
   The second guard is:
     if (FIRST_NITERS == NITERS) then skip the second loop.

   FORNOW only simple loops are supported (see slpeel_can_duplicate_loop_p).
   FORNOW the resulting code will not be in loop-closed-ssa form.
*/

struct loop*
slpeel_tree_peel_loop_to_edge (struct loop *loop,
                               edge e, tree first_niters,
                               tree niters, bool update_first_loop_count,
                               unsigned int th, bool check_profitability)
{
  struct loop *new_loop = NULL, *first_loop, *second_loop;
  edge skip_e;
  tree pre_condition = NULL_TREE;
  bitmap definitions;
  basic_block bb_before_second_loop, bb_after_second_loop;
  basic_block bb_before_first_loop;
  basic_block bb_between_loops;
  basic_block new_exit_bb;
  edge exit_e = single_exit (loop);
  LOC loop_loc;
  tree cost_pre_condition = NULL_TREE;

  if (!slpeel_can_duplicate_loop_p (loop, e))
    return NULL;

  /* We have to initialize cfg_hooks.  Then, when calling
     cfg_hooks->split_edge, the function tree_split_edge
     is actually called and, when calling cfg_hooks->duplicate_block,
     the function tree_duplicate_bb is called.  */
  gimple_register_cfg_hooks ();


  /* 1. Generate a copy of LOOP and put it on E (E is the entry/exit of LOOP).
        Resulting CFG would be:

        first_loop:
        do {
        } while ...

        second_loop:
        do {
        } while ...

        orig_exit_bb:
   */

  if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e)))
    {
      loop_loc = find_loop_location (loop);
      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          if (loop_loc != UNKNOWN_LOC)
            fprintf (dump_file, "\n%s:%d: note: ",
                     LOC_FILE (loop_loc), LOC_LINE (loop_loc));
          fprintf (dump_file, "tree_duplicate_loop_to_edge_cfg failed.\n");
        }
      return NULL;
    }

  if (e == exit_e)
    {
      /* NEW_LOOP was placed after LOOP.  */
      first_loop = loop;
      second_loop = new_loop;
    }
  else
    {
      /* NEW_LOOP was placed before LOOP.  */
      first_loop = new_loop;
      second_loop = loop;
    }

  definitions = ssa_names_to_replace ();
  slpeel_update_phis_for_duplicate_loop (loop, new_loop, e == exit_e);
  rename_variables_in_loop (new_loop);


  /* 2.  Add the guard code in one of the following ways:

     2.a Add the guard that controls whether the first loop is executed.
         This occurs when this function is invoked for prologue or epilogue
         generation and when the cost model check can be done at compile time.

         Resulting CFG would be:

        bb_before_first_loop:
        if (FIRST_NITERS == 0) GOTO bb_before_second_loop
                               GOTO first-loop

        first_loop:
        do {
        } while ...

        bb_before_second_loop:

        second_loop:
        do {
        } while ...

        orig_exit_bb:

     2.b Add the cost model check that allows the prologue
         to iterate for the entire unchanged scalar
         iterations of the loop in the event that the cost
         model indicates that the scalar loop is more
         profitable than the vector one.  This occurs when
         this function is invoked for prologue generation
         and the cost model check needs to be done at run
         time.

         Resulting CFG after prologue peeling would be:

         if (scalar_loop_iterations <= th)
           FIRST_NITERS = scalar_loop_iterations

         bb_before_first_loop:
         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
                                GOTO first-loop

         first_loop:
         do {
         } while ...

         bb_before_second_loop:

         second_loop:
         do {
         } while ...

         orig_exit_bb:

     2.c Add the cost model check that allows the epilogue
         to iterate for the entire unchanged scalar
         iterations of the loop in the event that the cost
         model indicates that the scalar loop is more
         profitable than the vector one.  This occurs when
         this function is invoked for epilogue generation
         and the cost model check needs to be done at run
         time.

         Resulting CFG after epilogue peeling would be:

         bb_before_first_loop:
         if ((scalar_loop_iterations <= th)
             ||
             FIRST_NITERS == 0) GOTO bb_before_second_loop
                                GOTO first-loop

         first_loop:
         do {
         } while ...

         bb_before_second_loop:

         second_loop:
         do {
         } while ...

         orig_exit_bb:
  */

  bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
  bb_before_second_loop = split_edge (single_exit (first_loop));

  /* Epilogue peeling.  */
  if (!update_first_loop_count)
    {
      pre_condition =
        fold_build2 (LE_EXPR, boolean_type_node, first_niters,
                     build_int_cst (TREE_TYPE (first_niters), 0));
      if (check_profitability)
        {
          tree scalar_loop_iters
            = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
                            (loop_vec_info_for_loop (loop)));
          cost_pre_condition =
            fold_build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
                         build_int_cst (TREE_TYPE (scalar_loop_iters), th));

          pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
                                       cost_pre_condition, pre_condition);
        }
    }

  /* Prologue peeling.  */
  else
    {
      if (check_profitability)
        set_prologue_iterations (bb_before_first_loop, first_niters,
                                 loop, th);

      pre_condition =
        fold_build2 (LE_EXPR, boolean_type_node, first_niters,
                     build_int_cst (TREE_TYPE (first_niters), 0));
    }

  skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
                                  bb_before_second_loop, bb_before_first_loop);
  slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop,
                                      first_loop == new_loop,
                                      &new_exit_bb, &definitions);


  /* 3. Add the guard that controls whether the second loop is executed.
        Resulting CFG would be:

        bb_before_first_loop:
        if (FIRST_NITERS == 0) GOTO bb_before_second_loop (skip first loop)
                               GOTO first-loop

        first_loop:
        do {
        } while ...

        bb_between_loops:
        if (FIRST_NITERS == NITERS) GOTO bb_after_second_loop (skip second loop)
                                    GOTO bb_before_second_loop

        bb_before_second_loop:

        second_loop:
        do {
        } while ...

        bb_after_second_loop:

        orig_exit_bb:
   */

  bb_between_loops = new_exit_bb;
  bb_after_second_loop = split_edge (single_exit (second_loop));

  pre_condition =
    fold_build2 (EQ_EXPR, boolean_type_node, first_niters, niters);
  skip_e = slpeel_add_loop_guard (bb_between_loops, pre_condition,
                                  bb_after_second_loop, bb_before_first_loop);
  slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop,
                                      second_loop == new_loop, &new_exit_bb);

  /* 4. Make first-loop iterate FIRST_NITERS times, if requested.  */
  if (update_first_loop_count)
    slpeel_make_loop_iterate_ntimes (first_loop, first_niters);

  BITMAP_FREE (definitions);
  delete_update_ssa ();

  return new_loop;
}

/* Function find_loop_location.

   Extract the location of the loop in the source code.
   If the loop is not well formed for vectorization, an estimated
   location is calculated.
   Return the loop location if it succeeds and UNKNOWN_LOC if not.  */

LOC
find_loop_location (struct loop *loop)
{
  gimple stmt = NULL;
  basic_block bb;
  gimple_stmt_iterator si;

  if (!loop)
    return UNKNOWN_LOC;

  stmt = get_loop_exit_condition (loop);

  if (stmt && gimple_location (stmt) != UNKNOWN_LOC)
    return gimple_location (stmt);

  /* If we got here the loop is probably not "well formed",
     try to estimate the loop location.  */

  if (!loop->header)
    return UNKNOWN_LOC;

  bb = loop->header;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOC)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOC;
}


/*************************************************************************
  Vectorization Debug Information.
 *************************************************************************/

/* Function vect_set_verbosity_level.

   Called from toplev.c upon detection of the
   -ftree-vectorizer-verbose=N option.  */

void
vect_set_verbosity_level (const char *val)
{
  unsigned int vl;

  vl = atoi (val);
  if (vl < MAX_VERBOSITY_LEVEL)
    vect_verbosity_level = vl;
  else
    vect_verbosity_level = MAX_VERBOSITY_LEVEL - 1;
}


/* Function vect_set_dump_settings.

   Fix the verbosity level of the vectorizer if the
   requested level was not set explicitly using the flag
   -ftree-vectorizer-verbose=N.
   Decide where to print the debugging information (dump_file/stderr).
   If the user defined the verbosity level, but there is no dump file,
   print to stderr, otherwise print to the dump file.  */

static void
vect_set_dump_settings (void)
{
  vect_dump = dump_file;

  /* Check if the verbosity level was defined by the user:  */
  if (vect_verbosity_level != MAX_VERBOSITY_LEVEL)
    {
      /* If there is no dump file, print to stderr.  */
      if (!dump_file)
        vect_dump = stderr;
      return;
    }

  /* User didn't specify verbosity level:  */
  if (dump_file && (dump_flags & TDF_DETAILS))
    vect_verbosity_level = REPORT_DETAILS;
  else if (dump_file && (dump_flags & TDF_STATS))
    vect_verbosity_level = REPORT_UNVECTORIZED_LOOPS;
  else
    vect_verbosity_level = REPORT_NONE;

  gcc_assert (dump_file || vect_verbosity_level == REPORT_NONE);
}


/* Function vect_print_dump_info.

   For vectorization debug dumps.  */

bool
vect_print_dump_info (enum verbosity_levels vl)
{
  if (vl > vect_verbosity_level)
    return false;

  if (!current_function_decl || !vect_dump)
    return false;

  if (vect_loop_location == UNKNOWN_LOC)
    fprintf (vect_dump, "\n%s:%d: note: ",
             DECL_SOURCE_FILE (current_function_decl),
             DECL_SOURCE_LINE (current_function_decl));
  else
    fprintf (vect_dump, "\n%s:%d: note: ",
             LOC_FILE (vect_loop_location), LOC_LINE (vect_loop_location));

  return true;
}

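
/* Example usage, illustrative only (not part of the original file): dump
   output is gated on the requested verbosity level before anything is
   written to vect_dump, e.g.

       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "=== analyzing loop ===");

   The call prints the location prefix and returns true only when the message
   should actually be emitted.  */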


/*************************************************************************
  Vectorization Utilities.
 *************************************************************************/

/* Function new_stmt_vec_info.

   Create and initialize a new stmt_vec_info struct for STMT.  */

stmt_vec_info
new_stmt_vec_info (gimple stmt, loop_vec_info loop_vinfo)
{
  stmt_vec_info res;
  res = (stmt_vec_info) xcalloc (1, sizeof (struct _stmt_vec_info));

  STMT_VINFO_TYPE (res) = undef_vec_info_type;
  STMT_VINFO_STMT (res) = stmt;
  STMT_VINFO_LOOP_VINFO (res) = loop_vinfo;
  STMT_VINFO_RELEVANT (res) = 0;
  STMT_VINFO_LIVE_P (res) = false;
  STMT_VINFO_VECTYPE (res) = NULL;
  STMT_VINFO_VEC_STMT (res) = NULL;
  STMT_VINFO_IN_PATTERN_P (res) = false;
  STMT_VINFO_RELATED_STMT (res) = NULL;
  STMT_VINFO_DATA_REF (res) = NULL;

  STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
  STMT_VINFO_DR_OFFSET (res) = NULL;
  STMT_VINFO_DR_INIT (res) = NULL;
  STMT_VINFO_DR_STEP (res) = NULL;
  STMT_VINFO_DR_ALIGNED_TO (res) = NULL;

  if (gimple_code (stmt) == GIMPLE_PHI
      && is_loop_header_bb_p (gimple_bb (stmt)))
    STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
  else
    STMT_VINFO_DEF_TYPE (res) = vect_loop_def;
  STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
  STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
  STMT_SLP_TYPE (res) = 0;
  DR_GROUP_FIRST_DR (res) = NULL;
  DR_GROUP_NEXT_DR (res) = NULL;
  DR_GROUP_SIZE (res) = 0;
  DR_GROUP_STORE_COUNT (res) = 0;
  DR_GROUP_GAP (res) = 0;
  DR_GROUP_SAME_DR_STMT (res) = NULL;
  DR_GROUP_READ_WRITE_DEPENDENCE (res) = false;

  return res;
}

/* Create the vector that maps stmts to stmt_vec_info structs.  */

void
init_stmt_vec_info_vec (void)
{
  gcc_assert (!stmt_vec_info_vec);
  stmt_vec_info_vec = VEC_alloc (vec_void_p, heap, 50);
}

/* Free the vector that maps stmts to stmt_vec_info structs.  */

void
free_stmt_vec_info_vec (void)
{
  gcc_assert (stmt_vec_info_vec);
  VEC_free (vec_void_p, heap, stmt_vec_info_vec);
}

/* Free stmt vectorization related info.  */

void
free_stmt_vec_info (gimple stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  if (!stmt_info)
    return;

  VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
  set_vinfo_for_stmt (stmt, NULL);
  free (stmt_info);
}

1602
|
|
1603
|
|
1604 /* Function bb_in_loop_p
|
|
1605
|
|
1606 Used as predicate for dfs order traversal of the loop bbs. */
|
|
1607
|
|
1608 static bool
|
|
1609 bb_in_loop_p (const_basic_block bb, const void *data)
|
|
1610 {
|
|
1611 const struct loop *const loop = (const struct loop *)data;
|
|
1612 if (flow_bb_inside_loop_p (loop, bb))
|
|
1613 return true;
|
|
1614 return false;
|
|
1615 }
|
|
1616
|
|
1617
|
|
1618 /* Function new_loop_vec_info.
|
|
1619
|
|
1620 Create and initialize a new loop_vec_info struct for LOOP, as well as
|
|
1621 stmt_vec_info structs for all the stmts in LOOP. */
|
|
1622
|
|
1623 loop_vec_info
|
|
1624 new_loop_vec_info (struct loop *loop)
|
|
1625 {
|
|
1626 loop_vec_info res;
|
|
1627 basic_block *bbs;
|
|
1628 gimple_stmt_iterator si;
|
|
1629 unsigned int i, nbbs;
|
|
1630
|
|
1631 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
|
|
1632 LOOP_VINFO_LOOP (res) = loop;
|
|
1633
|
|
1634 bbs = get_loop_body (loop);
|
|
1635
|
|
1636 /* Create/Update stmt_info for all stmts in the loop. */
|
|
1637 for (i = 0; i < loop->num_nodes; i++)
|
|
1638 {
|
|
1639 basic_block bb = bbs[i];
|
|
1640
|
|
1641 /* BBs in a nested inner-loop will have been already processed (because
|
|
1642 we will have called vect_analyze_loop_form for any nested inner-loop).
|
|
1643 Therefore, for stmts in an inner-loop we just want to update the
|
|
1644 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
|
|
1645 loop_info of the outer-loop we are currently considering to vectorize
|
|
1646 (instead of the loop_info of the inner-loop).
|
|
1647 For stmts in other BBs we need to create a stmt_info from scratch. */
|
|
1648 if (bb->loop_father != loop)
|
|
1649 {
|
|
1650 /* Inner-loop bb. */
|
|
1651 gcc_assert (loop->inner && bb->loop_father == loop->inner);
|
|
1652 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
|
|
1653 {
|
|
1654 gimple phi = gsi_stmt (si);
|
|
1655 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
|
|
1656 loop_vec_info inner_loop_vinfo =
|
|
1657 STMT_VINFO_LOOP_VINFO (stmt_info);
|
|
1658 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
|
|
1659 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
|
|
1660 }
|
|
1661 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
|
|
1662 {
|
|
1663 gimple stmt = gsi_stmt (si);
|
|
1664 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
|
|
1665 loop_vec_info inner_loop_vinfo =
|
|
1666 STMT_VINFO_LOOP_VINFO (stmt_info);
|
|
1667 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
|
|
1668 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
|
|
1669 }
|
|
1670 }
|
|
1671 else
|
|
1672 {
|
|
1673 /* bb in current nest. */
|
|
1674 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
|
|
1675 {
|
|
1676 gimple phi = gsi_stmt (si);
|
|
1677 gimple_set_uid (phi, 0);
|
|
1678 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
|
|
1679 }
|
|
1680
|
|
1681 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
|
|
1682 {
|
|
1683 gimple stmt = gsi_stmt (si);
|
|
1684 gimple_set_uid (stmt, 0);
|
|
1685 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
|
|
1686 }
|
|
1687 }
|
|
1688 }
|
|
1689
|
|
1690 /* CHECKME: We want to visit all BBs before their successors (except for
|
|
1691 latch blocks, for which this assertion wouldn't hold). In the simple
|
|
1692 case of the loop forms we allow, a dfs order of the BBs would the same
|
|
1693 as reversed postorder traversal, so we are safe. */
|
|
1694
|
|
1695 free (bbs);
|
|
1696 bbs = XCNEWVEC (basic_block, loop->num_nodes);
|
|
1697 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
|
|
1698 bbs, loop->num_nodes, loop);
|
|
1699 gcc_assert (nbbs == loop->num_nodes);
|
|
1700
|
|
1701 LOOP_VINFO_BBS (res) = bbs;
|
|
1702 LOOP_VINFO_NITERS (res) = NULL;
|
|
1703 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
|
|
1704 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
|
|
1705 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
|
|
1706 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
|
|
1707 LOOP_VINFO_VECT_FACTOR (res) = 0;
|
|
1708 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
|
|
1709 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
|
|
1710 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
|
|
1711 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
|
|
1712 VEC_alloc (gimple, heap,
|
|
1713 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
|
|
1714 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
|
|
1715 VEC_alloc (ddr_p, heap,
|
|
1716 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
|
|
1717 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
|
|
1718 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
|
|
1719 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
|
|
1720
|
|
1721 return res;
|
|
1722 }
|
|
1723
|
|
1724
|
|
1725 /* Function destroy_loop_vec_info.
|
|
1726
|
|
1727 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
|
|
1728 stmts in the loop. */
|
|
1729
|
|
1730 void
|
|
1731 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
|
|
1732 {
|
|
1733 struct loop *loop;
|
|
1734 basic_block *bbs;
|
|
1735 int nbbs;
|
|
1736 gimple_stmt_iterator si;
|
|
1737 int j;
|
|
1738 VEC (slp_instance, heap) *slp_instances;
|
|
1739 slp_instance instance;
|
|
1740
|
|
1741 if (!loop_vinfo)
|
|
1742 return;
|
|
1743
|
|
1744 loop = LOOP_VINFO_LOOP (loop_vinfo);
|
|
1745
|
|
1746 bbs = LOOP_VINFO_BBS (loop_vinfo);
|
|
1747 nbbs = loop->num_nodes;
|
|
1748
|
|
1749 if (!clean_stmts)
|
|
1750 {
|
|
1751 free (LOOP_VINFO_BBS (loop_vinfo));
|
|
1752 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
|
|
1753 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
|
|
1754 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
|
|
1755
|
|
1756 free (loop_vinfo);
|
|
1757 loop->aux = NULL;
|
|
1758 return;
|
|
1759 }
|
|
1760
|
|
1761 for (j = 0; j < nbbs; j++)
|
|
1762 {
|
|
1763 basic_block bb = bbs[j];
|
|
1764
|
|
1765 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
|
|
1766 free_stmt_vec_info (gsi_stmt (si));
|
|
1767
|
|
1768 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
|
|
1769 {
|
|
1770 gimple stmt = gsi_stmt (si);
|
|
1771 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
|
|
1772
|
|
1773 if (stmt_info)
|
|
1774 {
|
|
1775 /* Check if this is a "pattern stmt" (introduced by the
|
|
1776 vectorizer during the pattern recognition pass). */
|
|
1777 bool remove_stmt_p = false;
|
|
1778 gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
|
|
1779 if (orig_stmt)
|
|
1780 {
|
|
1781 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
|
|
1782 if (orig_stmt_info
|
|
1783 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
|
|
1784 remove_stmt_p = true;
|
|
1785 }
|
|
1786
|
|
1787 /* Free stmt_vec_info. */
|
|
1788 free_stmt_vec_info (stmt);
|
|
1789
|
|
1790 /* Remove dead "pattern stmts". */
|
|
1791 if (remove_stmt_p)
|
|
1792 gsi_remove (&si, true);
|
|
1793 }
|
|
1794 gsi_next (&si);
|
|
1795 }
|
|
1796 }
|
|
1797
|
|
1798 free (LOOP_VINFO_BBS (loop_vinfo));
|
|
1799 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
|
|
1800 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
|
|
1801 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
|
|
1802 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
|
|
1803 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
|
|
1804 for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
|
|
1805 vect_free_slp_instance (instance);
|
|
1806
|
|
1807 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
|
|
1808 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
|
|
1809
|
|
1810 free (loop_vinfo);
|
|
1811 loop->aux = NULL;
|
|
1812 }
|
|
1813
|
|
1814
|
|
1815 /* Function vect_force_dr_alignment_p.
|
|
1816
|
|
1817 Returns whether the alignment of a DECL can be forced to be aligned
|
|
1818 on ALIGNMENT bit boundary. */
|
|
1819
|
|
1820 bool
|
|
1821 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
|
|
1822 {
|
|
1823 if (TREE_CODE (decl) != VAR_DECL)
|
|
1824 return false;
|
|
1825
|
|
1826 if (DECL_EXTERNAL (decl))
|
|
1827 return false;
|
|
1828
|
|
1829 if (TREE_ASM_WRITTEN (decl))
|
|
1830 return false;
|
|
1831
|
|
1832 if (TREE_STATIC (decl))
|
|
1833 return (alignment <= MAX_OFILE_ALIGNMENT);
|
|
1834 else
|
|
1835 return (alignment <= MAX_STACK_ALIGNMENT);
|
|
1836 }
|
|
1837
|
|
1838
|
|
1839 /* Function get_vectype_for_scalar_type.
|
|
1840
|
|
1841 Returns the vector type corresponding to SCALAR_TYPE as supported
|
|
1842 by the target. */
|
|
1843
|
|
1844 tree
|
|
1845 get_vectype_for_scalar_type (tree scalar_type)
|
|
1846 {
|
|
1847 enum machine_mode inner_mode = TYPE_MODE (scalar_type);
|
|
1848 int nbytes = GET_MODE_SIZE (inner_mode);
|
|
1849 int nunits;
|
|
1850 tree vectype;
|
|
1851
|
|
1852 if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD (inner_mode))
|
|
1853 return NULL_TREE;
|
|
1854
|
|
1855 /* FORNOW: Only a single vector size per mode (UNITS_PER_SIMD_WORD)
|
|
1856 is expected. */
|
|
1857 nunits = UNITS_PER_SIMD_WORD (inner_mode) / nbytes;
|
|
1858
|
|
1859 vectype = build_vector_type (scalar_type, nunits);
|
|
1860 if (vect_print_dump_info (REPORT_DETAILS))
|
|
1861 {
|
|
1862 fprintf (vect_dump, "get vectype with %d units of type ", nunits);
|
|
1863 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
|
|
1864 }
|
|
1865
|
|
1866 if (!vectype)
|
|
1867 return NULL_TREE;
|
|
1868
|
|
1869 if (vect_print_dump_info (REPORT_DETAILS))
|
|
1870 {
|
|
1871 fprintf (vect_dump, "vectype: ");
|
|
1872 print_generic_expr (vect_dump, vectype, TDF_SLIM);
|
|
1873 }
|
|
1874
|
|
1875 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
|
|
1876 && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
|
|
1877 {
|
|
1878 if (vect_print_dump_info (REPORT_DETAILS))
|
|
1879 fprintf (vect_dump, "mode not supported by target.");
|
|
1880 return NULL_TREE;
|
|
1881 }
|
|
1882
|
|
1883 return vectype;
|
|
1884 }
|
|
1885
|
|
1886
|
|
1887 /* Function vect_supportable_dr_alignment
|
|
1888
|
|
1889 Return whether the data reference DR is supported with respect to its
|
|
1890 alignment. */
|
|
1891
|
|
1892 enum dr_alignment_support
|
|
1893 vect_supportable_dr_alignment (struct data_reference *dr)
|
|
1894 {
|
|
1895 gimple stmt = DR_STMT (dr);
|
|
1896 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
|
|
1897 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
|
|
1898 enum machine_mode mode = (int) TYPE_MODE (vectype);
|
|
1899 struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
|
|
1900 bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
|
|
1901 bool invariant_in_outerloop = false;
|
|
1902
|
|
1903 if (aligned_access_p (dr))
|
|
1904 return dr_aligned;
|
|
1905
|
|
1906 if (nested_in_vect_loop)
|
|
1907 {
|
|
1908 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
|
|
1909 invariant_in_outerloop =
|
|
1910 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
|
|
1911 }
|
|
1912
|
|
1913 /* Possibly unaligned access. */
|
|
1914
|
|
1915 /* We can choose between using the implicit realignment scheme (generating
|
|
1916 a misaligned_move stmt) and the explicit realignment scheme (generating
|
|
1917 aligned loads with a REALIGN_LOAD). There are two variants to the explicit
|
|
1918 realignment scheme: optimized, and unoptimized.
|
|
1919 We can optimize the realignment only if the step between consecutive
|
|
1920 vector loads is equal to the vector size. Since the vector memory
|
|
1921 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
|
|
1922 is guaranteed that the misalignment amount remains the same throughout the
|
|
1923 execution of the vectorized loop. Therefore, we can create the
|
|
1924 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
|
|
1925 at the loop preheader.
|
|
1926
|
|
1927 However, in the case of outer-loop vectorization, when vectorizing a
|
|
1928 memory access in the inner-loop nested within the LOOP that is now being
|
|
1929 vectorized, while it is guaranteed that the misalignment of the
|
|
1930 vectorized memory access will remain the same in different outer-loop
|
|
1931 iterations, it is *not* guaranteed that is will remain the same throughout
|
|
1932 the execution of the inner-loop. This is because the inner-loop advances
|
|
1933 with the original scalar step (and not in steps of VS). If the inner-loop
|
|
1934 step happens to be a multiple of VS, then the misalignment remains fixed
|
|
1935 and we can use the optimized realignment scheme. For example:
|
|
1936
|
|
1937 for (i=0; i<N; i++)
|
|
1938 for (j=0; j<M; j++)
|
|
1939 s += a[i+j];
|
|
1940
|
|
1941 When vectorizing the i-loop in the above example, the step between
|
|
1942 consecutive vector loads is 1, and so the misalignment does not remain
|
|
1943 fixed across the execution of the inner-loop, and the realignment cannot
|
|
1944 be optimized (as illustrated in the following pseudo vectorized loop):
|
|
1945
|
|
1946 for (i=0; i<N; i+=4)
|
|
1947 for (j=0; j<M; j++){
|
|
1948 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
|
|
1949 // when j is {0,1,2,3,4,5,6,7,...} respectively.
|
|
1950 // (assuming that we start from an aligned address).
|
|
1951 }
|
|
1952
|
|
1953 We therefore have to use the unoptimized realignment scheme:
|
|
1954
|
|
1955 for (i=0; i<N; i+=4)
|
|
1956 for (j=k; j<M; j+=4)
|
|
1957 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
|
|
1958 // that the misalignment of the initial address is
|
|
1959 // 0).
|
|
1960
|
|
1961 The loop can then be vectorized as follows:
|
|
1962
|
|
1963 for (k=0; k<4; k++){
|
|
1964 rt = get_realignment_token (&vp[k]);
|
|
1965 for (i=0; i<N; i+=4){
|
|
1966 v1 = vp[i+k];
|
|
1967 for (j=k; j<M; j+=4){
|
|
1968 v2 = vp[i+j+VS-1];
|
|
1969 va = REALIGN_LOAD <v1,v2,rt>;
|
|
1970 vs += va;
|
|
1971 v1 = v2;
|
|
1972 }
|
|
1973 }
|
|
1974 } */
|
|
1975
|
|
1976 if (DR_IS_READ (dr))
|
|
1977 {
|
|
1978 if (optab_handler (vec_realign_load_optab, mode)->insn_code !=
|
|
1979 CODE_FOR_nothing
|
|
1980 && (!targetm.vectorize.builtin_mask_for_load
|
|
1981 || targetm.vectorize.builtin_mask_for_load ()))
|
|
1982 {
|
|
1983 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
|
|
1984 if (nested_in_vect_loop
|
|
1985 && (TREE_INT_CST_LOW (DR_STEP (dr))
|
|
1986 != GET_MODE_SIZE (TYPE_MODE (vectype))))
|
|
1987 return dr_explicit_realign;
|
|
1988 else
|
|
1989 return dr_explicit_realign_optimized;
|
|
1990 }
|
|
1991
|
|
1992 if (optab_handler (movmisalign_optab, mode)->insn_code !=
|
|
1993 CODE_FOR_nothing)
|
|
1994 /* Can't software pipeline the loads, but can at least do them. */
|
|
1995 return dr_unaligned_supported;
|
|
1996 }
|
|
1997
|
|
1998 /* Unsupported. */
|
|
1999 return dr_unaligned_unsupported;
|
|
2000 }
|
|
2001
|
|
2002
|
|
2003 /* Function vect_is_simple_use.
|
|
2004
|
|
2005 Input:
|
|
2006 LOOP - the loop that is being vectorized.
|
|
2007 OPERAND - operand of a stmt in LOOP.
|
|
2008 DEF - the defining stmt in case OPERAND is an SSA_NAME.
|
|
2009
|
|
2010 Returns whether a stmt with OPERAND can be vectorized.
|
|
2011 Supportable operands are constants, loop invariants, and operands that are
|
|
2012 defined by the current iteration of the loop. Unsupportable operands are
|
|
2013 those that are defined by a previous iteration of the loop (as is the case
|
|
2014 in reduction/induction computations). */
|
|
2015
|
|
2016 bool
|
|
2017 vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, gimple *def_stmt,
|
|
2018 tree *def, enum vect_def_type *dt)
|
|
2019 {
|
|
2020 basic_block bb;
|
|
2021 stmt_vec_info stmt_vinfo;
|
|
2022 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
|
|
2023
|
|
2024 *def_stmt = NULL;
|
|
2025 *def = NULL_TREE;
|
|
2026
|
|
2027 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2028 {
|
|
2029 fprintf (vect_dump, "vect_is_simple_use: operand ");
|
|
2030 print_generic_expr (vect_dump, operand, TDF_SLIM);
|
|
2031 }
|
|
2032
|
|
2033 if (TREE_CODE (operand) == INTEGER_CST || TREE_CODE (operand) == REAL_CST)
|
|
2034 {
|
|
2035 *dt = vect_constant_def;
|
|
2036 return true;
|
|
2037 }
|
|
2038 if (is_gimple_min_invariant (operand))
|
|
2039 {
|
|
2040 *def = operand;
|
|
2041 *dt = vect_invariant_def;
|
|
2042 return true;
|
|
2043 }
|
|
2044
|
|
2045 if (TREE_CODE (operand) == PAREN_EXPR)
|
|
2046 {
|
|
2047 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2048 fprintf (vect_dump, "non-associatable copy.");
|
|
2049 operand = TREE_OPERAND (operand, 0);
|
|
2050 }
|
|
2051 if (TREE_CODE (operand) != SSA_NAME)
|
|
2052 {
|
|
2053 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2054 fprintf (vect_dump, "not ssa-name.");
|
|
2055 return false;
|
|
2056 }
|
|
2057
|
|
2058 *def_stmt = SSA_NAME_DEF_STMT (operand);
|
|
2059 if (*def_stmt == NULL)
|
|
2060 {
|
|
2061 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2062 fprintf (vect_dump, "no def_stmt.");
|
|
2063 return false;
|
|
2064 }
|
|
2065
|
|
2066 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2067 {
|
|
2068 fprintf (vect_dump, "def_stmt: ");
|
|
2069 print_gimple_stmt (vect_dump, *def_stmt, 0, TDF_SLIM);
|
|
2070 }
|
|
2071
|
|
2072 /* empty stmt is expected only in case of a function argument.
|
|
2073 (Otherwise - we expect a phi_node or a GIMPLE_ASSIGN). */
|
|
2074 if (gimple_nop_p (*def_stmt))
|
|
2075 {
|
|
2076 *def = operand;
|
|
2077 *dt = vect_invariant_def;
|
|
2078 return true;
|
|
2079 }
|
|
2080
|
|
2081 bb = gimple_bb (*def_stmt);
|
|
2082 if (!flow_bb_inside_loop_p (loop, bb))
|
|
2083 *dt = vect_invariant_def;
|
|
2084 else
|
|
2085 {
|
|
2086 stmt_vinfo = vinfo_for_stmt (*def_stmt);
|
|
2087 *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
|
|
2088 }
|
|
2089
|
|
2090 if (*dt == vect_unknown_def_type)
|
|
2091 {
|
|
2092 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2093 fprintf (vect_dump, "Unsupported pattern.");
|
|
2094 return false;
|
|
2095 }
|
|
2096
|
|
2097 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2098 fprintf (vect_dump, "type of def: %d.",*dt);
|
|
2099
|
|
2100 switch (gimple_code (*def_stmt))
|
|
2101 {
|
|
2102 case GIMPLE_PHI:
|
|
2103 *def = gimple_phi_result (*def_stmt);
|
|
2104 break;
|
|
2105
|
|
2106 case GIMPLE_ASSIGN:
|
|
2107 *def = gimple_assign_lhs (*def_stmt);
|
|
2108 break;
|
|
2109
|
|
2110 case GIMPLE_CALL:
|
|
2111 *def = gimple_call_lhs (*def_stmt);
|
|
2112 if (*def != NULL)
|
|
2113 break;
|
|
2114 /* FALLTHRU */
|
|
2115 default:
|
|
2116 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2117 fprintf (vect_dump, "unsupported defining stmt: ");
|
|
2118 return false;
|
|
2119 }
|
|
2120
|
|
2121 return true;
|
|
2122 }
|
|
2123
|
|
2124
|
|
2125 /* Function supportable_widening_operation
|
|
2126
|
|
2127 Check whether an operation represented by the code CODE is a
|
|
2128 widening operation that is supported by the target platform in
|
|
2129 vector form (i.e., when operating on arguments of type VECTYPE).
|
|
2130
|
|
2131 Widening operations we currently support are NOP (CONVERT), FLOAT
|
|
2132 and WIDEN_MULT. This function checks if these operations are supported
|
|
2133 by the target platform either directly (via vector tree-codes), or via
|
|
2134 target builtins.
|
|
2135
|
|
2136 Output:
|
|
2137 - CODE1 and CODE2 are codes of vector operations to be used when
|
|
2138 vectorizing the operation, if available.
|
|
2139 - DECL1 and DECL2 are decls of target builtin functions to be used
|
|
2140 when vectorizing the operation, if available. In this case,
|
|
2141 CODE1 and CODE2 are CALL_EXPR.
|
|
2142 - MULTI_STEP_CVT determines the number of required intermediate steps in
|
|
2143 case of multi-step conversion (like char->short->int - in that case
|
|
2144 MULTI_STEP_CVT will be 1).
|
|
2145 - INTERM_TYPES contains the intermediate type required to perform the
|
|
2146 widening operation (short in the above example). */
|
|
2147
|
|
2148 bool
|
|
2149 supportable_widening_operation (enum tree_code code, gimple stmt, tree vectype,
|
|
2150 tree *decl1, tree *decl2,
|
|
2151 enum tree_code *code1, enum tree_code *code2,
|
|
2152 int *multi_step_cvt,
|
|
2153 VEC (tree, heap) **interm_types)
|
|
2154 {
|
|
2155 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
|
|
2156 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
|
|
2157 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
|
|
2158 bool ordered_p;
|
|
2159 enum machine_mode vec_mode;
|
|
2160 enum insn_code icode1 = 0, icode2 = 0;
|
|
2161 optab optab1, optab2;
|
|
2162 tree type = gimple_expr_type (stmt);
|
|
2163 tree wide_vectype = get_vectype_for_scalar_type (type);
|
|
2164 enum tree_code c1, c2;
|
|
2165
|
|
2166 /* The result of a vectorized widening operation usually requires two vectors
|
|
2167 (because the widened results do not fit int one vector). The generated
|
|
2168 vector results would normally be expected to be generated in the same
|
|
2169 order as in the original scalar computation, i.e. if 8 results are
|
|
2170 generated in each vector iteration, they are to be organized as follows:
|
|
2171 vect1: [res1,res2,res3,res4], vect2: [res5,res6,res7,res8].
|
|
2172
|
|
2173 However, in the special case that the result of the widening operation is
|
|
2174 used in a reduction computation only, the order doesn't matter (because
|
|
2175 when vectorizing a reduction we change the order of the computation).
|
|
2176 Some targets can take advantage of this and generate more efficient code.
|
|
2177 For example, targets like Altivec, that support widen_mult using a sequence
|
|
2178 of {mult_even,mult_odd} generate the following vectors:
|
|
2179 vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].
|
|
2180
|
|
2181 When vectorizing outer-loops, we execute the inner-loop sequentially
|
|
2182 (each vectorized inner-loop iteration contributes to VF outer-loop
|
|
2183 iterations in parallel). We therefore don't allow to change the order
|
|
2184 of the computation in the inner-loop during outer-loop vectorization. */
|
|
2185
|
|
2186 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
|
|
2187 && !nested_in_vect_loop_p (vect_loop, stmt))
|
|
2188 ordered_p = false;
|
|
2189 else
|
|
2190 ordered_p = true;
|
|
2191
|
|
2192 if (!ordered_p
|
|
2193 && code == WIDEN_MULT_EXPR
|
|
2194 && targetm.vectorize.builtin_mul_widen_even
|
|
2195 && targetm.vectorize.builtin_mul_widen_even (vectype)
|
|
2196 && targetm.vectorize.builtin_mul_widen_odd
|
|
2197 && targetm.vectorize.builtin_mul_widen_odd (vectype))
|
|
2198 {
|
|
2199 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2200 fprintf (vect_dump, "Unordered widening operation detected.");
|
|
2201
|
|
2202 *code1 = *code2 = CALL_EXPR;
|
|
2203 *decl1 = targetm.vectorize.builtin_mul_widen_even (vectype);
|
|
2204 *decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype);
|
|
2205 return true;
|
|
2206 }
|
|
2207
|
|
2208 switch (code)
|
|
2209 {
|
|
2210 case WIDEN_MULT_EXPR:
|
|
2211 if (BYTES_BIG_ENDIAN)
|
|
2212 {
|
|
2213 c1 = VEC_WIDEN_MULT_HI_EXPR;
|
|
2214 c2 = VEC_WIDEN_MULT_LO_EXPR;
|
|
2215 }
|
|
2216 else
|
|
2217 {
|
|
2218 c2 = VEC_WIDEN_MULT_HI_EXPR;
|
|
2219 c1 = VEC_WIDEN_MULT_LO_EXPR;
|
|
2220 }
|
|
2221 break;
|
|
2222
|
|
2223 CASE_CONVERT:
|
|
2224 if (BYTES_BIG_ENDIAN)
|
|
2225 {
|
|
2226 c1 = VEC_UNPACK_HI_EXPR;
|
|
2227 c2 = VEC_UNPACK_LO_EXPR;
|
|
2228 }
|
|
2229 else
|
|
2230 {
|
|
2231 c2 = VEC_UNPACK_HI_EXPR;
|
|
2232 c1 = VEC_UNPACK_LO_EXPR;
|
|
2233 }
|
|
2234 break;
|
|
2235
|
|
2236 case FLOAT_EXPR:
|
|
2237 if (BYTES_BIG_ENDIAN)
|
|
2238 {
|
|
2239 c1 = VEC_UNPACK_FLOAT_HI_EXPR;
|
|
2240 c2 = VEC_UNPACK_FLOAT_LO_EXPR;
|
|
2241 }
|
|
2242 else
|
|
2243 {
|
|
2244 c2 = VEC_UNPACK_FLOAT_HI_EXPR;
|
|
2245 c1 = VEC_UNPACK_FLOAT_LO_EXPR;
|
|
2246 }
|
|
2247 break;
|
|
2248
|
|
2249 case FIX_TRUNC_EXPR:
|
|
2250 /* ??? Not yet implemented due to missing VEC_UNPACK_FIX_TRUNC_HI_EXPR/
|
|
2251 VEC_UNPACK_FIX_TRUNC_LO_EXPR tree codes and optabs used for
|
|
2252 computing the operation. */
|
|
2253 return false;
|
|
2254
|
|
2255 default:
|
|
2256 gcc_unreachable ();
|
|
2257 }
|
|
2258
|
|
2259 if (code == FIX_TRUNC_EXPR)
|
|
2260 {
|
|
2261 /* The signedness is determined from output operand. */
|
|
2262 optab1 = optab_for_tree_code (c1, type, optab_default);
|
|
2263 optab2 = optab_for_tree_code (c2, type, optab_default);
|
|
2264 }
|
|
2265 else
|
|
2266 {
|
|
2267 optab1 = optab_for_tree_code (c1, vectype, optab_default);
|
|
2268 optab2 = optab_for_tree_code (c2, vectype, optab_default);
|
|
2269 }
|
|
2270
|
|
2271 if (!optab1 || !optab2)
|
|
2272 return false;
|
|
2273
|
|
2274 vec_mode = TYPE_MODE (vectype);
|
|
2275 if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
|
|
2276 || (icode2 = optab_handler (optab2, vec_mode)->insn_code)
|
|
2277 == CODE_FOR_nothing)
|
|
2278 return false;
|
|
2279
|
|
2280 /* Check if it's a multi-step conversion that can be done using intermediate
|
|
2281 types. */
|
|
2282 if (insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype)
|
|
2283 || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype))
|
|
2284 {
|
|
2285 int i;
|
|
2286 tree prev_type = vectype, intermediate_type;
|
|
2287 enum machine_mode intermediate_mode, prev_mode = vec_mode;
|
|
2288 optab optab3, optab4;
|
|
2289
|
|
2290 if (!CONVERT_EXPR_CODE_P (code))
|
|
2291 return false;
|
|
2292
|
|
2293 *code1 = c1;
|
|
2294 *code2 = c2;
|
|
2295
|
|
2296 /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
|
|
2297 intermediate steps in promotion sequence. We try MAX_INTERM_CVT_STEPS
|
|
2298 to get to NARROW_VECTYPE, and fail if we do not. */
|
|
2299 *interm_types = VEC_alloc (tree, heap, MAX_INTERM_CVT_STEPS);
|
|
2300 for (i = 0; i < 3; i++)
|
|
2301 {
|
|
2302 intermediate_mode = insn_data[icode1].operand[0].mode;
|
|
2303 intermediate_type = lang_hooks.types.type_for_mode (intermediate_mode,
|
|
2304 TYPE_UNSIGNED (prev_type));
|
|
2305 optab3 = optab_for_tree_code (c1, intermediate_type, optab_default);
|
|
2306 optab4 = optab_for_tree_code (c2, intermediate_type, optab_default);
|
|
2307
|
|
2308 if (!optab3 || !optab4
|
|
2309 || (icode1 = optab1->handlers[(int) prev_mode].insn_code)
|
|
2310 == CODE_FOR_nothing
|
|
2311 || insn_data[icode1].operand[0].mode != intermediate_mode
|
|
2312 || (icode2 = optab2->handlers[(int) prev_mode].insn_code)
|
|
2313 == CODE_FOR_nothing
|
|
2314 || insn_data[icode2].operand[0].mode != intermediate_mode
|
|
2315 || (icode1 = optab3->handlers[(int) intermediate_mode].insn_code)
|
|
2316 == CODE_FOR_nothing
|
|
2317 || (icode2 = optab4->handlers[(int) intermediate_mode].insn_code)
|
|
2318 == CODE_FOR_nothing)
|
|
2319 return false;
|
|
2320
|
|
2321 VEC_quick_push (tree, *interm_types, intermediate_type);
|
|
2322 (*multi_step_cvt)++;
|
|
2323
|
|
2324 if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
|
|
2325 && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
|
|
2326 return true;
|
|
2327
|
|
2328 prev_type = intermediate_type;
|
|
2329 prev_mode = intermediate_mode;
|
|
2330 }
|
|
2331
|
|
2332 return false;
|
|
2333 }
|
|
2334
|
|
2335 *code1 = c1;
|
|
2336 *code2 = c2;
|
|
2337 return true;
|
|
2338 }
|
|
2339
|
|
2340
|
|
2341 /* Function supportable_narrowing_operation
|
|
2342
|
|
2343 Check whether an operation represented by the code CODE is a
|
|
2344 narrowing operation that is supported by the target platform in
|
|
2345 vector form (i.e., when operating on arguments of type VECTYPE).
|
|
2346
|
|
2347 Narrowing operations we currently support are NOP (CONVERT) and
|
|
2348 FIX_TRUNC. This function checks if these operations are supported by
|
|
2349 the target platform directly via vector tree-codes.
|
|
2350
|
|
2351 Output:
|
|
2352 - CODE1 is the code of a vector operation to be used when
|
|
2353 vectorizing the operation, if available.
|
|
2354 - MULTI_STEP_CVT determines the number of required intermediate steps in
|
|
2355 case of multi-step conversion (like int->short->char - in that case
|
|
2356 MULTI_STEP_CVT will be 1).
|
|
2357 - INTERM_TYPES contains the intermediate type required to perform the
|
|
2358 narrowing operation (short in the above example). */
|
|
2359
|
|
2360 bool
|
|
2361 supportable_narrowing_operation (enum tree_code code,
|
|
2362 const_gimple stmt, tree vectype,
|
|
2363 enum tree_code *code1, int *multi_step_cvt,
|
|
2364 VEC (tree, heap) **interm_types)
|
|
2365 {
|
|
2366 enum machine_mode vec_mode;
|
|
2367 enum insn_code icode1;
|
|
2368 optab optab1, interm_optab;
|
|
2369 tree type = gimple_expr_type (stmt);
|
|
2370 tree narrow_vectype = get_vectype_for_scalar_type (type);
|
|
2371 enum tree_code c1;
|
|
2372 tree intermediate_type, prev_type;
|
|
2373 int i;
|
|
2374
|
|
2375 switch (code)
|
|
2376 {
|
|
2377 CASE_CONVERT:
|
|
2378 c1 = VEC_PACK_TRUNC_EXPR;
|
|
2379 break;
|
|
2380
|
|
2381 case FIX_TRUNC_EXPR:
|
|
2382 c1 = VEC_PACK_FIX_TRUNC_EXPR;
|
|
2383 break;
|
|
2384
|
|
2385 case FLOAT_EXPR:
|
|
2386 /* ??? Not yet implemented due to missing VEC_PACK_FLOAT_EXPR
|
|
2387 tree code and optabs used for computing the operation. */
|
|
2388 return false;
|
|
2389
|
|
2390 default:
|
|
2391 gcc_unreachable ();
|
|
2392 }
|
|
2393
|
|
2394 if (code == FIX_TRUNC_EXPR)
|
|
2395 /* The signedness is determined from output operand. */
|
|
2396 optab1 = optab_for_tree_code (c1, type, optab_default);
|
|
2397 else
|
|
2398 optab1 = optab_for_tree_code (c1, vectype, optab_default);
|
|
2399
|
|
2400 if (!optab1)
|
|
2401 return false;
|
|
2402
|
|
2403 vec_mode = TYPE_MODE (vectype);
|
|
2404 if ((icode1 = optab_handler (optab1, vec_mode)->insn_code)
|
|
2405 == CODE_FOR_nothing)
|
|
2406 return false;
|
|
2407
|
|
2408 /* Check if it's a multi-step conversion that can be done using intermediate
|
|
2409 types. */
|
|
2410 if (insn_data[icode1].operand[0].mode != TYPE_MODE (narrow_vectype))
|
|
2411 {
|
|
2412 enum machine_mode intermediate_mode, prev_mode = vec_mode;
|
|
2413
|
|
2414 *code1 = c1;
|
|
2415 prev_type = vectype;
|
|
2416 /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
|
|
2417 intermediate steps in promotion sequence. We try MAX_INTERM_CVT_STEPS
|
|
2418 to get to NARROW_VECTYPE, and fail if we do not. */
|
|
2419 *interm_types = VEC_alloc (tree, heap, MAX_INTERM_CVT_STEPS);
|
|
2420 for (i = 0; i < 3; i++)
|
|
2421 {
|
|
2422 intermediate_mode = insn_data[icode1].operand[0].mode;
|
|
2423 intermediate_type = lang_hooks.types.type_for_mode (intermediate_mode,
|
|
2424 TYPE_UNSIGNED (prev_type));
|
|
2425 interm_optab = optab_for_tree_code (c1, intermediate_type,
|
|
2426 optab_default);
|
|
2427 if (!interm_optab
|
|
2428 || (icode1 = optab1->handlers[(int) prev_mode].insn_code)
|
|
2429 == CODE_FOR_nothing
|
|
2430 || insn_data[icode1].operand[0].mode != intermediate_mode
|
|
2431 || (icode1
|
|
2432 = interm_optab->handlers[(int) intermediate_mode].insn_code)
|
|
2433 == CODE_FOR_nothing)
|
|
2434 return false;
|
|
2435
|
|
2436 VEC_quick_push (tree, *interm_types, intermediate_type);
|
|
2437 (*multi_step_cvt)++;
|
|
2438
|
|
2439 if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
|
|
2440 return true;
|
|
2441
|
|
2442 prev_type = intermediate_type;
|
|
2443 prev_mode = intermediate_mode;
|
|
2444 }
|
|
2445
|
|
2446 return false;
|
|
2447 }
|
|
2448
|
|
2449 *code1 = c1;
|
|
2450 return true;
|
|
2451 }
|
|
2452
|
|
2453
|
|
2454 /* Function reduction_code_for_scalar_code
|
|
2455
|
|
2456 Input:
|
|
2457 CODE - tree_code of a reduction operations.
|
|
2458
|
|
2459 Output:
|
|
2460 REDUC_CODE - the corresponding tree-code to be used to reduce the
|
|
2461 vector of partial results into a single scalar result (which
|
|
2462 will also reside in a vector).
|
|
2463
|
|
2464 Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise. */
|
|
2465
|
|
2466 bool
|
|
2467 reduction_code_for_scalar_code (enum tree_code code,
|
|
2468 enum tree_code *reduc_code)
|
|
2469 {
|
|
2470 switch (code)
|
|
2471 {
|
|
2472 case MAX_EXPR:
|
|
2473 *reduc_code = REDUC_MAX_EXPR;
|
|
2474 return true;
|
|
2475
|
|
2476 case MIN_EXPR:
|
|
2477 *reduc_code = REDUC_MIN_EXPR;
|
|
2478 return true;
|
|
2479
|
|
2480 case PLUS_EXPR:
|
|
2481 *reduc_code = REDUC_PLUS_EXPR;
|
|
2482 return true;
|
|
2483
|
|
2484 default:
|
|
2485 return false;
|
|
2486 }
|
|
2487 }
|
|
2488
|
|
2489 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
|
|
2490 STMT is printed with a message MSG. */
|
|
2491
|
|
2492 static void
|
|
2493 report_vect_op (gimple stmt, const char *msg)
|
|
2494 {
|
|
2495 fprintf (vect_dump, "%s", msg);
|
|
2496 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
|
|
2497 }
|
|
2498
|
|
2499 /* Function vect_is_simple_reduction
|
|
2500
|
|
2501 Detect a cross-iteration def-use cycle that represents a simple
|
|
2502 reduction computation. We look for the following pattern:
|
|
2503
|
|
2504 loop_header:
|
|
2505 a1 = phi < a0, a2 >
|
|
2506 a3 = ...
|
|
2507 a2 = operation (a3, a1)
|
|
2508
|
|
2509 such that:
|
|
2510 1. operation is commutative and associative and it is safe to
|
|
2511 change the order of the computation.
|
|
2512 2. no uses for a2 in the loop (a2 is used out of the loop)
|
|
2513 3. no uses of a1 in the loop besides the reduction operation.
|
|
2514
|
|
2515 Condition 1 is tested here.
|
|
2516 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. */
|
|
2517
|
|
2518 gimple
|
|
2519 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi)
|
|
2520 {
|
|
2521 struct loop *loop = (gimple_bb (phi))->loop_father;
|
|
2522 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
|
|
2523 edge latch_e = loop_latch_edge (loop);
|
|
2524 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
|
|
2525 gimple def_stmt, def1, def2;
|
|
2526 enum tree_code code;
|
|
2527 tree op1, op2;
|
|
2528 tree type;
|
|
2529 int nloop_uses;
|
|
2530 tree name;
|
|
2531 imm_use_iterator imm_iter;
|
|
2532 use_operand_p use_p;
|
|
2533
|
|
2534 gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));
|
|
2535
|
|
2536 name = PHI_RESULT (phi);
|
|
2537 nloop_uses = 0;
|
|
2538 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
|
|
2539 {
|
|
2540 gimple use_stmt = USE_STMT (use_p);
|
|
2541 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
|
|
2542 && vinfo_for_stmt (use_stmt)
|
|
2543 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
|
|
2544 nloop_uses++;
|
|
2545 if (nloop_uses > 1)
|
|
2546 {
|
|
2547 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2548 fprintf (vect_dump, "reduction used in loop.");
|
|
2549 return NULL;
|
|
2550 }
|
|
2551 }
|
|
2552
|
|
2553 if (TREE_CODE (loop_arg) != SSA_NAME)
|
|
2554 {
|
|
2555 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2556 {
|
|
2557 fprintf (vect_dump, "reduction: not ssa_name: ");
|
|
2558 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
|
|
2559 }
|
|
2560 return NULL;
|
|
2561 }
|
|
2562
|
|
2563 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
|
|
2564 if (!def_stmt)
|
|
2565 {
|
|
2566 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2567 fprintf (vect_dump, "reduction: no def_stmt.");
|
|
2568 return NULL;
|
|
2569 }
|
|
2570
|
|
2571 if (!is_gimple_assign (def_stmt))
|
|
2572 {
|
|
2573 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2574 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
|
|
2575 return NULL;
|
|
2576 }
|
|
2577
|
|
2578 name = gimple_assign_lhs (def_stmt);
|
|
2579 nloop_uses = 0;
|
|
2580 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
|
|
2581 {
|
|
2582 gimple use_stmt = USE_STMT (use_p);
|
|
2583 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
|
|
2584 && vinfo_for_stmt (use_stmt)
|
|
2585 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
|
|
2586 nloop_uses++;
|
|
2587 if (nloop_uses > 1)
|
|
2588 {
|
|
2589 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2590 fprintf (vect_dump, "reduction used in loop.");
|
|
2591 return NULL;
|
|
2592 }
|
|
2593 }
|
|
2594
|
|
2595 code = gimple_assign_rhs_code (def_stmt);
|
|
2596
|
|
2597 if (!commutative_tree_code (code) || !associative_tree_code (code))
|
|
2598 {
|
|
2599 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2600 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
|
|
2601 return NULL;
|
|
2602 }
|
|
2603
|
|
2604 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
|
|
2605 {
|
|
2606 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2607 report_vect_op (def_stmt, "reduction: not binary operation: ");
|
|
2608 return NULL;
|
|
2609 }
|
|
2610
|
|
2611 op1 = gimple_assign_rhs1 (def_stmt);
|
|
2612 op2 = gimple_assign_rhs2 (def_stmt);
|
|
2613 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
|
|
2614 {
|
|
2615 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2616 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
|
|
2617 return NULL;
|
|
2618 }
|
|
2619
|
|
2620 /* Check that it's ok to change the order of the computation. */
|
|
2621 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
|
|
2622 if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
|
|
2623 || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
|
|
2624 {
|
|
2625 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2626 {
|
|
2627 fprintf (vect_dump, "reduction: multiple types: operation type: ");
|
|
2628 print_generic_expr (vect_dump, type, TDF_SLIM);
|
|
2629 fprintf (vect_dump, ", operands types: ");
|
|
2630 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
|
|
2631 fprintf (vect_dump, ",");
|
|
2632 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
|
|
2633 }
|
|
2634 return NULL;
|
|
2635 }
|
|
2636
|
|
2637 /* Generally, when vectorizing a reduction we change the order of the
|
|
2638 computation. This may change the behavior of the program in some
|
|
2639 cases, so we need to check that this is ok. One exception is when
|
|
2640 vectorizing an outer-loop: the inner-loop is executed sequentially,
|
|
2641 and therefore vectorizing reductions in the inner-loop during
|
|
2642 outer-loop vectorization is safe. */
|
|
2643
|
|
2644 /* CHECKME: check for !flag_finite_math_only too? */
|
|
2645 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
|
|
2646 && !nested_in_vect_loop_p (vect_loop, def_stmt))
|
|
2647 {
|
|
2648 /* Changing the order of operations changes the semantics. */
|
|
2649 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2650 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
|
|
2651 return NULL;
|
|
2652 }
|
|
2653 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
|
|
2654 && !nested_in_vect_loop_p (vect_loop, def_stmt))
|
|
2655 {
|
|
2656 /* Changing the order of operations changes the semantics. */
|
|
2657 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2658 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
|
|
2659 return NULL;
|
|
2660 }
|
|
2661 else if (SAT_FIXED_POINT_TYPE_P (type))
|
|
2662 {
|
|
2663 /* Changing the order of operations changes the semantics. */
|
|
2664 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2665 report_vect_op (def_stmt,
|
|
2666 "reduction: unsafe fixed-point math optimization: ");
|
|
2667 return NULL;
|
|
2668 }
|
|
2669
|
|
2670 /* reduction is safe. we're dealing with one of the following:
|
|
2671 1) integer arithmetic and no trapv
|
|
2672 2) floating point arithmetic, and special flags permit this optimization.
|
|
2673 */
|
|
2674 def1 = SSA_NAME_DEF_STMT (op1);
|
|
2675 def2 = SSA_NAME_DEF_STMT (op2);
|
|
2676 if (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2))
|
|
2677 {
|
|
2678 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2679 report_vect_op (def_stmt, "reduction: no defs for operands: ");
|
|
2680 return NULL;
|
|
2681 }
|
|
2682
|
|
2683
|
|
2684 /* Check that one def is the reduction def, defined by PHI,
|
|
2685 the other def is either defined in the loop ("vect_loop_def"),
|
|
2686 or it's an induction (defined by a loop-header phi-node). */
|
|
2687
|
|
2688 if (def2 == phi
|
|
2689 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
|
|
2690 && (is_gimple_assign (def1)
|
|
2691 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
|
|
2692 || (gimple_code (def1) == GIMPLE_PHI
|
|
2693 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
|
|
2694 && !is_loop_header_bb_p (gimple_bb (def1)))))
|
|
2695 {
|
|
2696 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2697 report_vect_op (def_stmt, "detected reduction:");
|
|
2698 return def_stmt;
|
|
2699 }
|
|
2700 else if (def1 == phi
|
|
2701 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
|
|
2702 && (is_gimple_assign (def2)
|
|
2703 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
|
|
2704 || (gimple_code (def2) == GIMPLE_PHI
|
|
2705 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
|
|
2706 && !is_loop_header_bb_p (gimple_bb (def2)))))
|
|
2707 {
|
|
2708 /* Swap operands (just for simplicity - so that the rest of the code
|
|
2709 can assume that the reduction variable is always the last (second)
|
|
2710 argument). */
|
|
2711 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2712 report_vect_op (def_stmt ,
|
|
2713 "detected reduction: need to swap operands:");
|
|
2714 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
|
|
2715 gimple_assign_rhs2_ptr (def_stmt));
|
|
2716 return def_stmt;
|
|
2717 }
|
|
2718 else
|
|
2719 {
|
|
2720 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2721 report_vect_op (def_stmt, "reduction: unknown pattern.");
|
|
2722 return NULL;
|
|
2723 }
|
|
2724 }
|
|
2725
|
|
2726
|
|
2727 /* Function vect_is_simple_iv_evolution.
|
|
2728
|
|
2729 FORNOW: A simple evolution of an induction variables in the loop is
|
|
2730 considered a polynomial evolution with constant step. */
|
|
2731
|
|
2732 bool
|
|
2733 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
|
|
2734 tree * step)
|
|
2735 {
|
|
2736 tree init_expr;
|
|
2737 tree step_expr;
|
|
2738 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
|
|
2739
|
|
2740 /* When there is no evolution in this loop, the evolution function
|
|
2741 is not "simple". */
|
|
2742 if (evolution_part == NULL_TREE)
|
|
2743 return false;
|
|
2744
|
|
2745 /* When the evolution is a polynomial of degree >= 2
|
|
2746 the evolution function is not "simple". */
|
|
2747 if (tree_is_chrec (evolution_part))
|
|
2748 return false;
|
|
2749
|
|
2750 step_expr = evolution_part;
|
|
2751 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
|
|
2752
|
|
2753 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2754 {
|
|
2755 fprintf (vect_dump, "step: ");
|
|
2756 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
|
|
2757 fprintf (vect_dump, ", init: ");
|
|
2758 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
|
|
2759 }
|
|
2760
|
|
2761 *init = init_expr;
|
|
2762 *step = step_expr;
|
|
2763
|
|
2764 if (TREE_CODE (step_expr) != INTEGER_CST)
|
|
2765 {
|
|
2766 if (vect_print_dump_info (REPORT_DETAILS))
|
|
2767 fprintf (vect_dump, "step unknown.");
|
|
2768 return false;
|
|
2769 }
|
|
2770
|
|
2771 return true;
|
|
2772 }
|
|
2773
|
|
2774
|
|
2775 /* Function vectorize_loops.
|
|
2776
|
|
2777 Entry Point to loop vectorization phase. */
|
|
2778
|
|
2779 unsigned
|
|
2780 vectorize_loops (void)
|
|
2781 {
|
|
2782 unsigned int i;
|
|
2783 unsigned int num_vectorized_loops = 0;
|
|
2784 unsigned int vect_loops_num;
|
|
2785 loop_iterator li;
|
|
2786 struct loop *loop;
|
|
2787
|
|
2788 vect_loops_num = number_of_loops ();
|
|
2789
|
|
2790 /* Bail out if there are no loops. */
|
|
2791 if (vect_loops_num <= 1)
|
|
2792 return 0;
|
|
2793
|
|
2794 /* Fix the verbosity level if not defined explicitly by the user. */
|
|
2795 vect_set_dump_settings ();
|
|
2796
|
|
2797 /* Allocate the bitmap that records which virtual variables that
|
|
2798 need to be renamed. */
|
|
2799 vect_memsyms_to_rename = BITMAP_ALLOC (NULL);
|
|
2800
|
|
2801 init_stmt_vec_info_vec ();
|
|
2802
|
|
2803 /* ----------- Analyze loops. ----------- */
|
|
2804
|
|
2805 /* If some loop was duplicated, it gets bigger number
|
|
2806 than all previously defined loops. This fact allows us to run
|
|
2807 only over initial loops skipping newly generated ones. */
|
|
2808 FOR_EACH_LOOP (li, loop, 0)
|
|
2809 if (optimize_loop_nest_for_speed_p (loop))
|
|
2810 {
|
|
2811 loop_vec_info loop_vinfo;
|
|
2812
|
|
2813 vect_loop_location = find_loop_location (loop);
|
|
2814 loop_vinfo = vect_analyze_loop (loop);
|
|
2815 loop->aux = loop_vinfo;
|
|
2816
|
|
2817 if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
|
|
2818 continue;
|
|
2819
|
|
2820 vect_transform_loop (loop_vinfo);
|
|
2821 num_vectorized_loops++;
|
|
2822 }
|
|
2823 vect_loop_location = UNKNOWN_LOC;
|
|
2824
|
|
2825 statistics_counter_event (cfun, "Vectorized loops", num_vectorized_loops);
|
|
2826 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)
|
|
2827 || (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
|
|
2828 && num_vectorized_loops > 0))
|
|
2829 fprintf (vect_dump, "vectorized %u loops in function.\n",
|
|
2830 num_vectorized_loops);
|
|
2831
|
|
2832 /* ----------- Finalize. ----------- */
|
|
2833
|
|
2834 BITMAP_FREE (vect_memsyms_to_rename);
|
|
2835
|
|
2836 for (i = 1; i < vect_loops_num; i++)
|
|
2837 {
|
|
2838 loop_vec_info loop_vinfo;
|
|
2839
|
|
2840 loop = get_loop (i);
|
|
2841 if (!loop)
|
|
2842 continue;
|
|
2843 loop_vinfo = (loop_vec_info) loop->aux;
|
|
2844 destroy_loop_vec_info (loop_vinfo, true);
|
|
2845 loop->aux = NULL;
|
|
2846 }
|
|
2847
|
|
2848 free_stmt_vec_info_vec ();
|
|
2849
|
|
2850 return num_vectorized_loops > 0 ? TODO_cleanup_cfg : 0;
|
|
2851 }
|
|
2852
|
|
2853 /* Increase alignment of global arrays to improve vectorization potential.
|
|
2854 TODO:
|
|
2855 - Consider also structs that have an array field.
|
|
2856 - Use ipa analysis to prune arrays that can't be vectorized?
|
|
2857 This should involve global alignment analysis and in the future also
|
|
2858 array padding. */
|
|
2859
|
|
2860 static unsigned int
|
|
2861 increase_alignment (void)
|
|
2862 {
|
|
2863 struct varpool_node *vnode;
|
|
2864
|
|
2865 /* Increase the alignment of all global arrays for vectorization. */
|
|
2866 for (vnode = varpool_nodes_queue;
|
|
2867 vnode;
|
|
2868 vnode = vnode->next_needed)
|
|
2869 {
|
|
2870 tree vectype, decl = vnode->decl;
|
|
2871 unsigned int alignment;
|
|
2872
|
|
2873 if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
|
|
2874 continue;
|
|
2875 vectype = get_vectype_for_scalar_type (TREE_TYPE (TREE_TYPE (decl)));
|
|
2876 if (!vectype)
|
|
2877 continue;
|
|
2878 alignment = TYPE_ALIGN (vectype);
|
|
2879 if (DECL_ALIGN (decl) >= alignment)
|
|
2880 continue;
|
|
2881
|
|
2882 if (vect_can_force_dr_alignment_p (decl, alignment))
|
|
2883 {
|
|
2884 DECL_ALIGN (decl) = TYPE_ALIGN (vectype);
|
|
2885 DECL_USER_ALIGN (decl) = 1;
|
|
2886 if (dump_file)
|
|
2887 {
|
|
2888 fprintf (dump_file, "Increasing alignment of decl: ");
|
|
2889 print_generic_expr (dump_file, decl, TDF_SLIM);
|
|
2890 }
|
|
2891 }
|
|
2892 }
|
|
2893 return 0;
|
|
2894 }
|
|
2895
|
|
2896 static bool
|
|
2897 gate_increase_alignment (void)
|
|
2898 {
|
|
2899 return flag_section_anchors && flag_tree_vectorize;
|
|
2900 }
|
|
2901
|
|
2902 struct simple_ipa_opt_pass pass_ipa_increase_alignment =
|
|
2903 {
|
|
2904 {
|
|
2905 SIMPLE_IPA_PASS,
|
|
2906 "increase_alignment", /* name */
|
|
2907 gate_increase_alignment, /* gate */
|
|
2908 increase_alignment, /* execute */
|
|
2909 NULL, /* sub */
|
|
2910 NULL, /* next */
|
|
2911 0, /* static_pass_number */
|
|
2912 0, /* tv_id */
|
|
2913 0, /* properties_required */
|
|
2914 0, /* properties_provided */
|
|
2915 0, /* properties_destroyed */
|
|
2916 0, /* todo_flags_start */
|
|
2917 0 /* todo_flags_finish */
|
|
2918 }
|
|
2919 };
|