comparison gcc/tree-vect-transform.c @ 0:a06113de4d67

first commit
author kent <kent@cr.ie.u-ryukyu.ac.jp>
date Fri, 17 Jul 2009 14:47:48 +0900
parents
children 855418dad1a3
1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "ggc.h"
27 #include "tree.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "basic-block.h"
31 #include "diagnostic.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "timevar.h"
35 #include "cfgloop.h"
36 #include "expr.h"
37 #include "optabs.h"
38 #include "params.h"
39 #include "recog.h"
40 #include "tree-data-ref.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "langhooks.h"
45 #include "tree-pass.h"
46 #include "toplev.h"
47 #include "real.h"
48
49 /* Utility functions for the code transformation. */
50 static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
51 slp_tree, slp_instance);
52 static tree vect_create_destination_var (tree, tree);
53 static tree vect_create_data_ref_ptr
54 (gimple, struct loop*, tree, tree *, gimple *, bool, bool *, tree);
55 static tree vect_create_addr_base_for_vector_ref
56 (gimple, gimple_seq *, tree, struct loop *);
57 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
58 static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
59 static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
60 static void vect_finish_stmt_generation
61 (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
62 static bool vect_is_simple_cond (tree, loop_vec_info);
63 static void vect_create_epilog_for_reduction
64 (tree, gimple, int, enum tree_code, gimple);
65 static tree get_initial_def_for_reduction (gimple, tree, tree *);
66
67 /* Utility functions dealing with loop peeling (not peeling itself). */
68 static void vect_generate_tmps_on_preheader
69 (loop_vec_info, tree *, tree *, tree *);
70 static tree vect_build_loop_niters (loop_vec_info);
71 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
72 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
73 static void vect_update_init_of_dr (struct data_reference *, tree niters);
74 static void vect_update_inits_of_drs (loop_vec_info, tree);
75 static int vect_min_worthwhile_factor (enum tree_code);
76
77
78 static int
79 cost_for_stmt (gimple stmt)
80 {
81 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
82
83 switch (STMT_VINFO_TYPE (stmt_info))
84 {
85 case load_vec_info_type:
86 return TARG_SCALAR_LOAD_COST;
87 case store_vec_info_type:
88 return TARG_SCALAR_STORE_COST;
89 case op_vec_info_type:
90 case condition_vec_info_type:
91 case assignment_vec_info_type:
92 case reduc_vec_info_type:
93 case induc_vec_info_type:
94 case type_promotion_vec_info_type:
95 case type_demotion_vec_info_type:
96 case type_conversion_vec_info_type:
97 case call_vec_info_type:
98 return TARG_SCALAR_STMT_COST;
99 case undef_vec_info_type:
100 default:
101 gcc_unreachable ();
102 }
103 }
104
105
106 /* Function vect_estimate_min_profitable_iters
107
108 Return the number of iterations required for the vector version of the
109 loop to be profitable relative to the cost of the scalar version of the
110 loop.
111
112 TODO: Take profile info into account before making vectorization
113 decisions, if available. */
114
115 int
116 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
117 {
118 int i;
119 int min_profitable_iters;
120 int peel_iters_prologue;
121 int peel_iters_epilogue;
122 int vec_inside_cost = 0;
123 int vec_outside_cost = 0;
124 int scalar_single_iter_cost = 0;
125 int scalar_outside_cost = 0;
126 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
127 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
128 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
129 int nbbs = loop->num_nodes;
130 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
131 int peel_guard_costs = 0;
132 int innerloop_iters = 0, factor;
133 VEC (slp_instance, heap) *slp_instances;
134 slp_instance instance;
135
136 /* Cost model disabled. */
137 if (!flag_vect_cost_model)
138 {
139 if (vect_print_dump_info (REPORT_COST))
140 fprintf (vect_dump, "cost model disabled.");
141 return 0;
142 }
143
144 /* Requires loop versioning tests to handle misalignment. */
145 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
146 {
147 /* FIXME: Make cost depend on complexity of individual check. */
148 vec_outside_cost +=
149 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
150 if (vect_print_dump_info (REPORT_COST))
151 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
152 "versioning to treat misalignment.\n");
153 }
154
155 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
156 {
157 /* FIXME: Make cost depend on complexity of individual check. */
158 vec_outside_cost +=
159 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
160 if (vect_print_dump_info (REPORT_COST))
161 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
162 "versioning aliasing.\n");
163 }
164
165 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
166 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
167 {
168 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
169 }
170
171 /* Count statements in the scalar loop. Use this as the scalar cost for a
172 single iteration for now.
173
174 TODO: Add outer loop support.
175
176 TODO: Consider assigning different costs to different scalar
177 statements. */
178
179 /* FORNOW. */
180 if (loop->inner)
181 innerloop_iters = 50; /* FIXME */
182
183 for (i = 0; i < nbbs; i++)
184 {
185 gimple_stmt_iterator si;
186 basic_block bb = bbs[i];
187
188 if (bb->loop_father == loop->inner)
189 factor = innerloop_iters;
190 else
191 factor = 1;
192
193 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
194 {
195 gimple stmt = gsi_stmt (si);
196 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
197 /* Skip stmts that are not vectorized inside the loop. */
198 if (!STMT_VINFO_RELEVANT_P (stmt_info)
199 && (!STMT_VINFO_LIVE_P (stmt_info)
200 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
201 continue;
202 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
203 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
204 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
205 some of the "outside" costs are generated inside the outer-loop. */
206 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
207 }
208 }
209
210 /* Add additional cost for the peeled instructions in prologue and epilogue
211 loop.
212
213 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
214 at compile time, we assume it's vf/2 (the worst case would be vf-1).
215
216 TODO: Build an expression that represents peel_iters for prologue and
217 epilogue to be used in a run-time test. */
218
219 if (byte_misalign < 0)
220 {
221 peel_iters_prologue = vf/2;
222 if (vect_print_dump_info (REPORT_COST))
223 fprintf (vect_dump, "cost model: "
224 "prologue peel iters set to vf/2.");
225
226 /* If peeling for alignment is unknown, the loop bound of the main loop
227 becomes unknown. */
228 peel_iters_epilogue = vf/2;
229 if (vect_print_dump_info (REPORT_COST))
230 fprintf (vect_dump, "cost model: "
231 "epilogue peel iters set to vf/2 because "
232 "peeling for alignment is unknown .");
233
234 /* If peeled iterations are unknown, count a taken branch and a not taken
235 branch per peeled loop. Even if scalar loop iterations are known,
236 vector iterations are not known since peeled prologue iterations are
237 not known. Hence guards remain the same. */
238 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
239 + TARG_COND_NOT_TAKEN_BRANCH_COST);
240 }
241 else
242 {
243 if (byte_misalign)
244 {
245 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
246 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
247 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
248 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
249
250 peel_iters_prologue = nelements - (byte_misalign / element_size);
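/* For example (illustrative numbers): with a V4SI vector type
   (nelements == 4), 4-byte elements and byte_misalign == 8, the prologue
   must peel 4 - 8/4 == 2 scalar iterations before the access becomes
   vector aligned.  */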
251 }
252 else
253 peel_iters_prologue = 0;
254
255 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
256 {
257 peel_iters_epilogue = vf/2;
258 if (vect_print_dump_info (REPORT_COST))
259 fprintf (vect_dump, "cost model: "
260 "epilogue peel iters set to vf/2 because "
261 "loop iterations are unknown .");
262
263 /* If peeled iterations are known but the number of scalar loop
264 iterations is unknown, count a taken branch per peeled loop. */
265 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
266
267 }
268 else
269 {
270 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
271 peel_iters_prologue = niters < peel_iters_prologue ?
272 niters : peel_iters_prologue;
273 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
274 }
275 }
276
277 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
278 + (peel_iters_epilogue * scalar_single_iter_cost)
279 + peel_guard_costs;
280
281 /* FORNOW: The scalar outside cost is incremented in one of the
282 following ways:
283
284 1. The vectorizer checks for alignment and aliasing and generates
285 a condition that allows dynamic vectorization. A cost model
286 check is ANDed with the versioning condition. Hence the scalar code
287 path now has the added cost of the versioning check.
288
289 if (cost > th & versioning_check)
290 jmp to vector code
291
292 Hence the run-time scalar cost is incremented by a not-taken branch cost.
293
294 2. The vectorizer then checks if a prologue is required. If the
295 cost model check was not done during versioning, it has to
296 be done before the prologue check.
297
298 if (cost <= th)
299 prologue = scalar_iters
300 if (prologue == 0)
301 jmp to vector code
302 else
303 execute prologue
304 if (prologue == num_iters)
305 go to exit
306
307 Hence the run-time scalar cost is incremented by a taken branch,
308 plus a not-taken branch, plus a taken branch cost.
309
310 3. The vectorizer then checks if an epilogue is required. If the
311 cost model check was not done during the prologue check, it
312 has to be done with the epilogue check.
313
314 if (prologue == 0)
315 jmp to vector code
316 else
317 execute prologue
318 if (prologue == num_iters)
319 go to exit
320 vector code:
321 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
322 jmp to epilogue
323
324 Hence the run-time scalar cost should be incremented by 2 taken
325 branches.
326
327 TODO: The back end may reorder the BBs differently and reverse
328 conditions/branch directions. Change the estimates below to
329 something more reasonable. */
330
331 /* If the number of iterations is known and we do not do versioning, we can
332 decide whether to vectorize at compile time. Hence the scalar version
333 does not carry cost model guard costs. */
334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
335 || VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
336 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
337 {
338 /* Cost model check occurs at versioning. */
339 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
340 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
341 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
342 else
343 {
344 /* Cost model check occurs at prologue generation. */
345 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
346 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
347 + TARG_COND_NOT_TAKEN_BRANCH_COST;
348 /* Cost model check occurs at epilogue generation. */
349 else
350 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
351 }
352 }
353
354 /* Add SLP costs. */
355 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
356 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
357 {
358 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
359 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
360 }
361
362 /* Calculate number of iterations required to make the vector version
363 profitable, relative to the loop bodies only. The following condition
364 must hold true:
365 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
366 where
367 SIC = scalar iteration cost, VIC = vector iteration cost,
368 VOC = vector outside cost, VF = vectorization factor,
369 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
370 SOC = scalar outside cost for run time cost model check. */
371
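/* Rewriting the condition above in integer arithmetic (multiply both sides
   by VF and solve for niters) gives the closed form computed below:

     niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
              / (SIC * VF - VIC)

   The division truncates, which is why the result is re-checked and bumped
   by one when it still fails the profitability test.

   Illustrative example (made-up costs): SIC = 2, VIC = 4, VF = 4, VOC = 12,
   SOC = 0, PL_ITERS = EP_ITERS = 0.  The formula gives 48 / 4 = 12; at 12
   iterations both versions cost 24, so the value is bumped to 13, and after
   the final adjustment further below the returned threshold is 12
   ("skip the vector loop if niters <= 12").  */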
372 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
373 {
374 if (vec_outside_cost <= 0)
375 min_profitable_iters = 1;
376 else
377 {
378 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
379 - vec_inside_cost * peel_iters_prologue
380 - vec_inside_cost * peel_iters_epilogue)
381 / ((scalar_single_iter_cost * vf)
382 - vec_inside_cost);
383
384 if ((scalar_single_iter_cost * vf * min_profitable_iters)
385 <= ((vec_inside_cost * min_profitable_iters)
386 + ((vec_outside_cost - scalar_outside_cost) * vf)))
387 min_profitable_iters++;
388 }
389 }
390 /* The vector version will never be profitable. */
391 else
392 {
393 if (vect_print_dump_info (REPORT_COST))
394 fprintf (vect_dump, "cost model: vector iteration cost = %d "
395 "is divisible by scalar iteration cost = %d by a factor "
396 "greater than or equal to the vectorization factor = %d .",
397 vec_inside_cost, scalar_single_iter_cost, vf);
398 return -1;
399 }
400
401 if (vect_print_dump_info (REPORT_COST))
402 {
403 fprintf (vect_dump, "Cost model analysis: \n");
404 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
405 vec_inside_cost);
406 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
407 vec_outside_cost);
408 fprintf (vect_dump, " Scalar iteration cost: %d\n",
409 scalar_single_iter_cost);
410 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
411 fprintf (vect_dump, " prologue iterations: %d\n",
412 peel_iters_prologue);
413 fprintf (vect_dump, " epilogue iterations: %d\n",
414 peel_iters_epilogue);
415 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
416 min_profitable_iters);
417 }
418
419 min_profitable_iters =
420 min_profitable_iters < vf ? vf : min_profitable_iters;
421
422 /* Because the condition we create is:
423 if (niters <= min_profitable_iters)
424 then skip the vectorized loop. */
425 min_profitable_iters--;
426
427 if (vect_print_dump_info (REPORT_COST))
428 fprintf (vect_dump, " Profitability threshold = %d\n",
429 min_profitable_iters);
430
431 return min_profitable_iters;
432 }
433
434
435 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
436 functions. Design better to avoid maintenance issues. */
437
438 /* Function vect_model_reduction_cost.
439
440 Models cost for a reduction operation, including the vector ops
441 generated within the strip-mine loop, the initial definition before
442 the loop, and the epilogue code that must be generated. */
443
444 static bool
445 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
446 int ncopies)
447 {
448 int outer_cost = 0;
449 enum tree_code code;
450 optab optab;
451 tree vectype;
452 gimple stmt, orig_stmt;
453 tree reduction_op;
454 enum machine_mode mode;
455 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
456 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
457
458
459 /* Cost of reduction op inside loop. */
460 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
461
462 stmt = STMT_VINFO_STMT (stmt_info);
463
464 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
465 {
466 case GIMPLE_SINGLE_RHS:
467 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
468 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
469 break;
470 case GIMPLE_UNARY_RHS:
471 reduction_op = gimple_assign_rhs1 (stmt);
472 break;
473 case GIMPLE_BINARY_RHS:
474 reduction_op = gimple_assign_rhs2 (stmt);
475 break;
476 default:
477 gcc_unreachable ();
478 }
479
480 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
481 if (!vectype)
482 {
483 if (vect_print_dump_info (REPORT_COST))
484 {
485 fprintf (vect_dump, "unsupported data-type ");
486 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
487 }
488 return false;
489 }
490
491 mode = TYPE_MODE (vectype);
492 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
493
494 if (!orig_stmt)
495 orig_stmt = STMT_VINFO_STMT (stmt_info);
496
497 code = gimple_assign_rhs_code (orig_stmt);
498
499 /* Add in cost for initial definition. */
500 outer_cost += TARG_SCALAR_TO_VEC_COST;
501
502 /* Determine cost of epilogue code.
503
504 We have a reduction operator that will reduce the vector in one statement.
505 Also requires scalar extract. */
506
507 if (!nested_in_vect_loop_p (loop, orig_stmt))
508 {
509 if (reduc_code < NUM_TREE_CODES)
510 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
511 else
512 {
513 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
514 tree bitsize =
515 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
516 int element_bitsize = tree_low_cst (bitsize, 1);
517 int nelements = vec_size_in_bits / element_bitsize;
518
519 optab = optab_for_tree_code (code, vectype, optab_default);
520
521 /* We have a whole vector shift available. */
522 if (VECTOR_MODE_P (mode)
523 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
524 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
525 /* Final reduction via vector shifts and the reduction operator. Also
526 requires scalar extract. */
527 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
528 + TARG_VEC_TO_SCALAR_COST);
529 else
530 /* Use extracts and reduction op for final reduction. For N elements,
531 we have N extracts and N-1 reduction ops. */
532 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
533 }
534 }
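/* For example (illustrative), in the non-nested case above: with a direct
   reduction code the epilogue costs one vector stmt plus one vec-to-scalar
   extract.  Otherwise, reducing a V8HI vector (nelements == 8) with the
   whole-vector-shift strategy costs exact_log2 (8) * 2 == 6 vector stmts
   plus one extract, while the extract-based fallback costs 8 + 7 == 15
   vector stmts.  */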
535
536 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
537
538 if (vect_print_dump_info (REPORT_COST))
539 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
540 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
541 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
542
543 return true;
544 }
545
546
547 /* Function vect_model_induction_cost.
548
549 Models cost for induction operations. */
550
551 static void
552 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
553 {
554 /* loop cost for vec_loop. */
555 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
556 /* prologue cost for vec_init and vec_step. */
557 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
558
559 if (vect_print_dump_info (REPORT_COST))
560 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
561 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
562 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
563 }
564
565
566 /* Function vect_model_simple_cost.
567
568 Models cost for simple operations, i.e. those that only emit ncopies of a
569 single op. Right now, this does not account for multiple insns that could
570 be generated for the single vector op. We will handle that shortly. */
571
572 void
573 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
574 enum vect_def_type *dt, slp_tree slp_node)
575 {
576 int i;
577 int inside_cost = 0, outside_cost = 0;
578
579 /* The SLP costs were already calculated during SLP tree build. */
580 if (PURE_SLP_STMT (stmt_info))
581 return;
582
583 inside_cost = ncopies * TARG_VEC_STMT_COST;
584
585 /* FORNOW: Assuming maximum 2 args per stmt. */
586 for (i = 0; i < 2; i++)
587 {
588 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
589 outside_cost += TARG_SCALAR_TO_VEC_COST;
590 }
591
592 if (vect_print_dump_info (REPORT_COST))
593 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
594 "outside_cost = %d .", inside_cost, outside_cost);
595
596 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
597 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
598 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
599 }
600
601
602 /* Function vect_cost_strided_group_size
603
604 For strided load or store, return the group_size only if it is the first
605 load or store of a group, else return 1. This ensures that group size is
606 only returned once per group. */
607
608 static int
609 vect_cost_strided_group_size (stmt_vec_info stmt_info)
610 {
611 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
612
613 if (first_stmt == STMT_VINFO_STMT (stmt_info))
614 return DR_GROUP_SIZE (stmt_info);
615
616 return 1;
617 }
618
619
620 /* Function vect_model_store_cost
621
622 Models cost for stores. In the case of strided accesses, one access
623 has the overhead of the strided access attributed to it. */
624
625 void
626 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
627 enum vect_def_type dt, slp_tree slp_node)
628 {
629 int group_size;
630 int inside_cost = 0, outside_cost = 0;
631
632 /* The SLP costs were already calculated during SLP tree build. */
633 if (PURE_SLP_STMT (stmt_info))
634 return;
635
636 if (dt == vect_constant_def || dt == vect_invariant_def)
637 outside_cost = TARG_SCALAR_TO_VEC_COST;
638
639 /* Strided access? */
640 if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
641 group_size = vect_cost_strided_group_size (stmt_info);
642 /* Not a strided access. */
643 else
644 group_size = 1;
645
646 /* Is this an access in a group of stores, which provide strided access?
647 If so, add in the cost of the permutes. */
648 if (group_size > 1)
649 {
650 /* Uses a high and low interleave operation for each needed permute. */
651 inside_cost = ncopies * exact_log2(group_size) * group_size
652 * TARG_VEC_STMT_COST;
653
654 if (vect_print_dump_info (REPORT_COST))
655 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
656 group_size);
657
658 }
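/* For example (illustrative): a store group of size 4 with ncopies == 1 is
   modelled as exact_log2 (4) * 4 == 8 interleave (high/low) stmts, in
   addition to the vector store cost added below.  */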
659
660 /* Costs of the stores. */
661 inside_cost += ncopies * TARG_VEC_STORE_COST;
662
663 if (vect_print_dump_info (REPORT_COST))
664 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
665 "outside_cost = %d .", inside_cost, outside_cost);
666
667 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
668 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
669 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
670 }
671
672
673 /* Function vect_model_load_cost
674
675 Models cost for loads. In the case of strided accesses, the last access
676 has the overhead of the strided access attributed to it. Since unaligned
677 accesses are supported for loads, we also account for the costs of the
678 access scheme chosen. */
679
680 void
681 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
682
683 {
684 int group_size;
685 int alignment_support_scheme;
686 gimple first_stmt;
687 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
688 int inside_cost = 0, outside_cost = 0;
689
690 /* The SLP costs were already calculated during SLP tree build. */
691 if (PURE_SLP_STMT (stmt_info))
692 return;
693
694 /* Strided accesses? */
695 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
696 if (first_stmt && !slp_node)
697 {
698 group_size = vect_cost_strided_group_size (stmt_info);
699 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
700 }
701 /* Not a strided access. */
702 else
703 {
704 group_size = 1;
705 first_dr = dr;
706 }
707
708 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
709
710 /* Is this an access in a group of loads providing strided access?
711 If so, add in the cost of the permutes. */
712 if (group_size > 1)
713 {
714 /* Uses even and odd extract operations for each needed permute. */
715 inside_cost = ncopies * exact_log2(group_size) * group_size
716 * TARG_VEC_STMT_COST;
717
718 if (vect_print_dump_info (REPORT_COST))
719 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
720 group_size);
721
722 }
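/* Summary of the costs added by the switch below: dr_aligned adds one
   vector load per copy; dr_unaligned_supported one unaligned vector load
   per copy; dr_explicit_realign two vector loads plus a permute per copy
   (and one mask-creation stmt when builtin_mask_for_load is available);
   dr_explicit_realign_optimized one load plus a realign op per copy inside
   the loop, with the address / first-load / mask setup counted outside
   the loop.  */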
723
724 /* The loads themselves. */
725 switch (alignment_support_scheme)
726 {
727 case dr_aligned:
728 {
729 inside_cost += ncopies * TARG_VEC_LOAD_COST;
730
731 if (vect_print_dump_info (REPORT_COST))
732 fprintf (vect_dump, "vect_model_load_cost: aligned.");
733
734 break;
735 }
736 case dr_unaligned_supported:
737 {
738 /* Here, we assign an additional cost for the unaligned load. */
739 inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
740
741 if (vect_print_dump_info (REPORT_COST))
742 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
743 "hardware.");
744
745 break;
746 }
747 case dr_explicit_realign:
748 {
749 inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
750
751 /* FIXME: If the misalignment remains fixed across the iterations of
752 the containing loop, the following cost should be added to the
753 outside costs. */
754 if (targetm.vectorize.builtin_mask_for_load)
755 inside_cost += TARG_VEC_STMT_COST;
756
757 break;
758 }
759 case dr_explicit_realign_optimized:
760 {
761 if (vect_print_dump_info (REPORT_COST))
762 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
763 "pipelined.");
764
765 /* Unaligned software pipeline has a load of an address, an initial
766 load, and possibly a mask operation to "prime" the loop. However,
767 if this is an access in a group of loads, which provide strided
768 access, then the above cost should only be considered for one
769 access in the group. Inside the loop, there is a load op
770 and a realignment op. */
771
772 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
773 {
774 outside_cost = 2*TARG_VEC_STMT_COST;
775 if (targetm.vectorize.builtin_mask_for_load)
776 outside_cost += TARG_VEC_STMT_COST;
777 }
778
779 inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
780
781 break;
782 }
783
784 default:
785 gcc_unreachable ();
786 }
787
788 if (vect_print_dump_info (REPORT_COST))
789 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
790 "outside_cost = %d .", inside_cost, outside_cost);
791
792 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
793 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
794 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
795 }
796
797
798 /* Function vect_get_new_vect_var.
799
800 Returns a new temporary variable. The current naming scheme prepends the
801 prefix "vect_", "stmp_" or "vect_p" (depending on the value of VAR_KIND)
802 to NAME if NAME is provided; otherwise only the prefix is used to name the
803 vectorizer generated variable. */
804
805 static tree
806 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
807 {
808 const char *prefix;
809 tree new_vect_var;
810
811 switch (var_kind)
812 {
813 case vect_simple_var:
814 prefix = "vect_";
815 break;
816 case vect_scalar_var:
817 prefix = "stmp_";
818 break;
819 case vect_pointer_var:
820 prefix = "vect_p";
821 break;
822 default:
823 gcc_unreachable ();
824 }
825
826 if (name)
827 {
828 char* tmp = concat (prefix, name, NULL);
829 new_vect_var = create_tmp_var (type, tmp);
830 free (tmp);
831 }
832 else
833 new_vect_var = create_tmp_var (type, prefix);
834
835 /* Mark vector typed variable as a gimple register variable. */
836 if (TREE_CODE (type) == VECTOR_TYPE)
837 DECL_GIMPLE_REG_P (new_vect_var) = true;
838
839 return new_vect_var;
840 }
841
842
843 /* Function vect_create_addr_base_for_vector_ref.
844
845 Create an expression that computes the address of the first memory location
846 that will be accessed for a data reference.
847
848 Input:
849 STMT: The statement containing the data reference.
850 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
851 OFFSET: Optional. If supplied, it is added to the initial address.
852 LOOP: Specifies the loop-nest relative to which the address should be computed.
853 For example, when the dataref is in an inner-loop nested in an
854 outer-loop that is now being vectorized, LOOP can be either the
855 outer-loop, or the inner-loop. The first memory location accessed
856 by the following dataref ('in' points to short):
857
858 for (i=0; i<N; i++)
859 for (j=0; j<M; j++)
860 s += in[i+j]
861
862 is as follows:
863 if LOOP=i_loop: &in (relative to i_loop)
864 if LOOP=j_loop: &in+i*2B (relative to j_loop)
865
866 Output:
867 1. Return an SSA_NAME whose value is the address of the memory location of
868 the first vector of the data reference.
869 2. If new_stmt_list is not NULL_TREE after return, then the caller must insert
870 the statement(s) that define the returned SSA_NAME.
871
872 FORNOW: We are only handling array accesses with step 1. */
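/* Roughly, the gimple sequence built below looks like this (names are
   illustrative; "batmp" and "base_off" are the temporaries created below,
   and the pointer addition is a POINTER_PLUS_EXPR):

     batmp = <DR_BASE_ADDRESS>;
     base_off = <DR_OFFSET> + <DR_INIT> [+ OFFSET * step];
     vect_p = (vectype *) (batmp + base_off);

   all gimplified into NEW_STMT_LIST for the caller to insert.  */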
873
874 static tree
875 vect_create_addr_base_for_vector_ref (gimple stmt,
876 gimple_seq *new_stmt_list,
877 tree offset,
878 struct loop *loop)
879 {
880 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
881 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
882 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
883 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
884 tree base_name;
885 tree data_ref_base_var;
886 tree vec_stmt;
887 tree addr_base, addr_expr;
888 tree dest;
889 gimple_seq seq = NULL;
890 tree base_offset = unshare_expr (DR_OFFSET (dr));
891 tree init = unshare_expr (DR_INIT (dr));
892 tree vect_ptr_type, addr_expr2;
893 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
894
895 gcc_assert (loop);
896 if (loop != containing_loop)
897 {
898 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
899 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
900
901 gcc_assert (nested_in_vect_loop_p (loop, stmt));
902
903 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
904 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
905 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
906 }
907
908 /* Create data_ref_base */
909 base_name = build_fold_indirect_ref (data_ref_base);
910 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
911 add_referenced_var (data_ref_base_var);
912 data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
913 data_ref_base_var);
914 gimple_seq_add_seq (new_stmt_list, seq);
915
916 /* Create base_offset */
917 base_offset = size_binop (PLUS_EXPR,
918 fold_convert (sizetype, base_offset),
919 fold_convert (sizetype, init));
920 dest = create_tmp_var (sizetype, "base_off");
921 add_referenced_var (dest);
922 base_offset = force_gimple_operand (base_offset, &seq, true, dest);
923 gimple_seq_add_seq (new_stmt_list, seq);
924
925 if (offset)
926 {
927 tree tmp = create_tmp_var (sizetype, "offset");
928
929 add_referenced_var (tmp);
930 offset = fold_build2 (MULT_EXPR, sizetype,
931 fold_convert (sizetype, offset), step);
932 base_offset = fold_build2 (PLUS_EXPR, sizetype,
933 base_offset, offset);
934 base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
935 gimple_seq_add_seq (new_stmt_list, seq);
936 }
937
938 /* base + base_offset */
939 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
940 data_ref_base, base_offset);
941
942 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
943
944 /* addr_expr = addr_base */
945 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
946 get_name (base_name));
947 add_referenced_var (addr_expr);
948 vec_stmt = fold_convert (vect_ptr_type, addr_base);
949 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
950 get_name (base_name));
951 add_referenced_var (addr_expr2);
952 vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
953 gimple_seq_add_seq (new_stmt_list, seq);
954
955 if (vect_print_dump_info (REPORT_DETAILS))
956 {
957 fprintf (vect_dump, "created ");
958 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
959 }
960 return vec_stmt;
961 }
962
963
964 /* Function vect_create_data_ref_ptr.
965
966 Create a new pointer to vector type (vp), that points to the first location
967 accessed in the loop by STMT, along with the def-use update chain to
968 appropriately advance the pointer through the loop iterations. Also set
969 aliasing information for the pointer. This vector pointer is used by the
970 callers to this function to create a memory reference expression for vector
971 load/store access.
972
973 Input:
974 1. STMT: a stmt that references memory. Expected to be of the form
975 GIMPLE_ASSIGN <name, data-ref> or
976 GIMPLE_ASSIGN <data-ref, name>.
977 2. AT_LOOP: the loop where the vector memref is to be created.
978 3. OFFSET (optional): an offset to be added to the initial address accessed
979 by the data-ref in STMT.
980 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
981 pointing to the initial address.
982 5. TYPE: if not NULL indicates the required type of the data-ref.
983
984 Output:
985 1. Declare a new ptr to vector_type, and have it point to the base of the
986 data reference (initial address accessed by the data reference).
987 For example, for vector of type V8HI, the following code is generated:
988
989 v8hi *vp;
990 vp = (v8hi *)initial_address;
991
992 if OFFSET is not supplied:
993 initial_address = &a[init];
994 if OFFSET is supplied:
995 initial_address = &a[init + OFFSET];
996
997 Return the initial_address in INITIAL_ADDRESS.
998
999 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
1000 update the pointer in each iteration of the loop.
1001
1002 Return the increment stmt that updates the pointer in PTR_INCR.
1003
1004 3. Set INV_P to true if the access pattern of the data reference in the
1005 vectorized loop is invariant. Set it to false otherwise.
1006
1007 4. Return the pointer. */
1008
1009 static tree
1010 vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
1011 tree offset, tree *initial_address, gimple *ptr_incr,
1012 bool only_init, bool *inv_p, tree type)
1013 {
1014 tree base_name;
1015 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1016 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1017 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1018 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
1019 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
1020 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1021 tree vect_ptr_type;
1022 tree vect_ptr;
1023 tree tag;
1024 tree new_temp;
1025 gimple vec_stmt;
1026 gimple_seq new_stmt_list = NULL;
1027 edge pe;
1028 basic_block new_bb;
1029 tree vect_ptr_init;
1030 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1031 tree vptr;
1032 gimple_stmt_iterator incr_gsi;
1033 bool insert_after;
1034 tree indx_before_incr, indx_after_incr;
1035 gimple incr;
1036 tree step;
1037
1038 /* Check the step (evolution) of the load in LOOP, and record
1039 whether it's invariant. */
1040 if (nested_in_vect_loop)
1041 step = STMT_VINFO_DR_STEP (stmt_info);
1042 else
1043 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
1044
1045 if (tree_int_cst_compare (step, size_zero_node) == 0)
1046 *inv_p = true;
1047 else
1048 *inv_p = false;
1049
1050 /* Create an expression for the first address accessed by this load
1051 in LOOP. */
1052 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
1053
1054 if (vect_print_dump_info (REPORT_DETAILS))
1055 {
1056 tree data_ref_base = base_name;
1057 fprintf (vect_dump, "create vector-pointer variable to type: ");
1058 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1059 if (TREE_CODE (data_ref_base) == VAR_DECL)
1060 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
1061 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
1062 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
1063 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
1064 fprintf (vect_dump, " vectorizing a record based array ref: ");
1065 else if (TREE_CODE (data_ref_base) == SSA_NAME)
1066 fprintf (vect_dump, " vectorizing a pointer ref: ");
1067 print_generic_expr (vect_dump, base_name, TDF_SLIM);
1068 }
1069
1070 /** (1) Create the new vector-pointer variable: **/
1071 if (type)
1072 vect_ptr_type = build_pointer_type (type);
1073 else
1074 vect_ptr_type = build_pointer_type (vectype);
1075
1076 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
1077 && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
1078 vect_ptr_type = build_qualified_type (vect_ptr_type, TYPE_QUAL_RESTRICT);
1079 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
1080 get_name (base_name));
1081 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
1082 && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
1083 {
1084 get_alias_set (base_name);
1085 DECL_POINTER_ALIAS_SET (vect_ptr)
1086 = DECL_POINTER_ALIAS_SET (SSA_NAME_VAR (DR_BASE_ADDRESS (dr)));
1087 }
1088
1089 add_referenced_var (vect_ptr);
1090
1091 /** (2) Add aliasing information to the new vector-pointer:
1092 (The points-to info (DR_PTR_INFO) may be defined later.) **/
1093
1094 tag = DR_SYMBOL_TAG (dr);
1095 gcc_assert (tag);
1096
1097 /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
1098 tag must be created with tag added to its may alias list. */
1099 if (!MTAG_P (tag))
1100 new_type_alias (vect_ptr, tag, DR_REF (dr));
1101 else
1102 {
1103 set_symbol_mem_tag (vect_ptr, tag);
1104 mark_sym_for_renaming (tag);
1105 }
1106
1107 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
1108 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
1109 def-use update cycles for the pointer: One relative to the outer-loop
1110 (LOOP), which is what steps (3) and (4) below do. The other is relative
1111 to the inner-loop (which is the inner-most loop containing the dataref),
1112 and this is done by step (5) below.
1113
1114 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
1115 inner-most loop, and so steps (3),(4) work the same, and step (5) is
1116 redundant. Steps (3),(4) create the following:
1117
1118 vp0 = &base_addr;
1119 LOOP: vp1 = phi(vp0,vp2)
1120 ...
1121 ...
1122 vp2 = vp1 + step
1123 goto LOOP
1124
1125 If there is an inner-loop nested in loop, then step (5) will also be
1126 applied, and an additional update in the inner-loop will be created:
1127
1128 vp0 = &base_addr;
1129 LOOP: vp1 = phi(vp0,vp2)
1130 ...
1131 inner: vp3 = phi(vp1,vp4)
1132 vp4 = vp3 + inner_step
1133 if () goto inner
1134 ...
1135 vp2 = vp1 + step
1136 if () goto LOOP */
1137
1138 /** (3) Calculate the initial address of the vector-pointer, and set
1139 the vector-pointer to point to it before the loop: **/
1140
1141 /* Create: &(base[init_val+offset]) in the loop preheader. */
1142
1143 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1144 offset, loop);
1145 pe = loop_preheader_edge (loop);
1146 if (new_stmt_list)
1147 {
1148 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
1149 gcc_assert (!new_bb);
1150 }
1151
1152 *initial_address = new_temp;
1153
1154 /* Create: p = (vectype *) initial_base */
1155 vec_stmt = gimple_build_assign (vect_ptr,
1156 fold_convert (vect_ptr_type, new_temp));
1157 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1158 gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
1159 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
1160 gcc_assert (!new_bb);
1161
1162
1163 /** (4) Handle the updating of the vector-pointer inside the loop.
1164 This is needed when ONLY_INIT is false, and also when AT_LOOP
1165 is the inner-loop nested in LOOP (during outer-loop vectorization).
1166 **/
1167
1168 if (only_init && at_loop == loop) /* No update in loop is required. */
1169 {
1170 /* Copy the points-to information if it exists. */
1171 if (DR_PTR_INFO (dr))
1172 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1173 vptr = vect_ptr_init;
1174 }
1175 else
1176 {
1177 /* The step of the vector pointer is the Vector Size. */
1178 tree step = TYPE_SIZE_UNIT (vectype);
1179 /* One exception to the above is when the scalar step of the load in
1180 LOOP is zero. In this case the step here is also zero. */
1181 if (*inv_p)
1182 step = size_zero_node;
1183
1184 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1185
1186 create_iv (vect_ptr_init,
1187 fold_convert (vect_ptr_type, step),
1188 vect_ptr, loop, &incr_gsi, insert_after,
1189 &indx_before_incr, &indx_after_incr);
1190 incr = gsi_stmt (incr_gsi);
1191 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1192
1193 /* Copy the points-to information if it exists. */
1194 if (DR_PTR_INFO (dr))
1195 {
1196 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1197 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1198 }
1199 merge_alias_info (vect_ptr_init, indx_before_incr);
1200 merge_alias_info (vect_ptr_init, indx_after_incr);
1201 if (ptr_incr)
1202 *ptr_incr = incr;
1203
1204 vptr = indx_before_incr;
1205 }
1206
1207 if (!nested_in_vect_loop || only_init)
1208 return vptr;
1209
1210
1211 /** (5) Handle the updating of the vector-pointer inside the inner-loop
1212 nested in LOOP, if it exists: **/
1213
1214 gcc_assert (nested_in_vect_loop);
1215 if (!only_init)
1216 {
1217 standard_iv_increment_position (containing_loop, &incr_gsi,
1218 &insert_after);
1219 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), vect_ptr,
1220 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
1221 &indx_after_incr);
1222 incr = gsi_stmt (incr_gsi);
1223 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1224
1225 /* Copy the points-to information if it exists. */
1226 if (DR_PTR_INFO (dr))
1227 {
1228 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1229 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1230 }
1231 merge_alias_info (vect_ptr_init, indx_before_incr);
1232 merge_alias_info (vect_ptr_init, indx_after_incr);
1233 if (ptr_incr)
1234 *ptr_incr = incr;
1235
1236 return indx_before_incr;
1237 }
1238 else
1239 gcc_unreachable ();
1240 }
1241
1242
1243 /* Function bump_vector_ptr
1244
1245 Increment a pointer (to a vector type) by vector-size. If requested,
1246 i.e. if PTR-INCR is given, then also connect the new increment stmt
1247 to the existing def-use update-chain of the pointer, by modifying
1248 the PTR_INCR as illustrated below:
1249
1250 The pointer def-use update-chain before this function:
1251 DATAREF_PTR = phi (p_0, p_2)
1252 ....
1253 PTR_INCR: p_2 = DATAREF_PTR + step
1254
1255 The pointer def-use update-chain after this function:
1256 DATAREF_PTR = phi (p_0, p_2)
1257 ....
1258 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1259 ....
1260 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
1261
1262 Input:
1263 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1264 in the loop.
1265 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1266 the loop. The increment amount across iterations is expected
1267 to be vector_size.
1268 GSI - location where the new update stmt is to be placed.
1269 STMT - the original scalar memory-access stmt that is being vectorized.
1270 BUMP - optional. The offset by which to bump the pointer. If not given,
1271 the offset is assumed to be vector_size.
1272
1273 Output: Return NEW_DATAREF_PTR as illustrated above.
1274
1275 */
1276
1277 static tree
1278 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
1279 gimple stmt, tree bump)
1280 {
1281 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1282 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1283 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1284 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1285 tree update = TYPE_SIZE_UNIT (vectype);
1286 gimple incr_stmt;
1287 ssa_op_iter iter;
1288 use_operand_p use_p;
1289 tree new_dataref_ptr;
1290
1291 if (bump)
1292 update = bump;
1293
1294 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
1295 dataref_ptr, update);
1296 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1297 gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
1298 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
1299
1300 /* Copy the points-to information if it exists. */
1301 if (DR_PTR_INFO (dr))
1302 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1303 merge_alias_info (new_dataref_ptr, dataref_ptr);
1304
1305 if (!ptr_incr)
1306 return new_dataref_ptr;
1307
1308 /* Update the vector-pointer's cross-iteration increment. */
1309 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1310 {
1311 tree use = USE_FROM_PTR (use_p);
1312
1313 if (use == dataref_ptr)
1314 SET_USE (use_p, new_dataref_ptr);
1315 else
1316 gcc_assert (tree_int_cst_compare (use, update) == 0);
1317 }
1318
1319 return new_dataref_ptr;
1320 }
1321
1322
1323 /* Function vect_create_destination_var.
1324
1325 Create a new temporary of type VECTYPE. */
1326
1327 static tree
1328 vect_create_destination_var (tree scalar_dest, tree vectype)
1329 {
1330 tree vec_dest;
1331 const char *new_name;
1332 tree type;
1333 enum vect_var_kind kind;
1334
1335 kind = vectype ? vect_simple_var : vect_scalar_var;
1336 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1337
1338 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1339
1340 new_name = get_name (scalar_dest);
1341 if (!new_name)
1342 new_name = "var_";
1343 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1344 add_referenced_var (vec_dest);
1345
1346 return vec_dest;
1347 }
1348
1349
1350 /* Function vect_init_vector.
1351
1352 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
1353 the vector elements of VECTOR_VAR. Place the initialization at GSI if it
1354 is not NULL. Otherwise, place the initialization at the loop preheader.
1355 Return the DEF of INIT_STMT.
1356 It will be used in the vectorization of STMT. */
1357
1358 static tree
1359 vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
1360 gimple_stmt_iterator *gsi)
1361 {
1362 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1363 tree new_var;
1364 gimple init_stmt;
1365 tree vec_oprnd;
1366 edge pe;
1367 tree new_temp;
1368 basic_block new_bb;
1369
1370 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1371 add_referenced_var (new_var);
1372 init_stmt = gimple_build_assign (new_var, vector_var);
1373 new_temp = make_ssa_name (new_var, init_stmt);
1374 gimple_assign_set_lhs (init_stmt, new_temp);
1375
1376 if (gsi)
1377 vect_finish_stmt_generation (stmt, init_stmt, gsi);
1378 else
1379 {
1380 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1381 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1382
1383 if (nested_in_vect_loop_p (loop, stmt))
1384 loop = loop->inner;
1385 pe = loop_preheader_edge (loop);
1386 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1387 gcc_assert (!new_bb);
1388 }
1389
1390 if (vect_print_dump_info (REPORT_DETAILS))
1391 {
1392 fprintf (vect_dump, "created new init_stmt: ");
1393 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1394 }
1395
1396 vec_oprnd = gimple_assign_lhs (init_stmt);
1397 return vec_oprnd;
1398 }
1399
1400
1401 /* For constant and loop invariant defs of SLP_NODE this function returns
1402 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1403 OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
1404 stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
1405
1406 static void
1407 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1408 unsigned int op_num, unsigned int number_of_vectors)
1409 {
1410 VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1411 gimple stmt = VEC_index (gimple, stmts, 0);
1412 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1413 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1414 int nunits;
1415 tree vec_cst;
1416 tree t = NULL_TREE;
1417 int j, number_of_places_left_in_vector;
1418 tree vector_type;
1419 tree op, vop;
1420 int group_size = VEC_length (gimple, stmts);
1421 unsigned int vec_num, i;
1422 int number_of_copies = 1;
1423 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1424 bool constant_p, is_store;
1425
1426 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1427 {
1428 is_store = true;
1429 op = gimple_assign_rhs1 (stmt);
1430 }
1431 else
1432 {
1433 is_store = false;
1434 op = gimple_op (stmt, op_num + 1);
1435 }
1436
1437 if (CONSTANT_CLASS_P (op))
1438 {
1439 vector_type = vectype;
1440 constant_p = true;
1441 }
1442 else
1443 {
1444 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1445 gcc_assert (vector_type);
1446 constant_p = false;
1447 }
1448
1449 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
1450
1451 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1452 created vectors. It is greater than 1 if unrolling is performed.
1453
1454 For example, we have two scalar operands, s1 and s2 (e.g., group of
1455 strided accesses of size two), while NUNITS is four (i.e., four scalars
1456 of this type can be packed in a vector). The output vector will contain
1457 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1458 will be 2).
1459
1460 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1461 containing the operands.
1462
1463 For example, NUNITS is four as before, and the group size is 8
1464 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1465 {s5, s6, s7, s8}. */
1466
1467 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
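/* E.g. (illustrative): nunits == 4 and group_size == 2 gives
   lcm (4, 2) / 2 == 2 copies (one vector {s1, s2, s1, s2});
   nunits == 4 and group_size == 8 gives lcm (4, 8) / 8 == 1 copy
   (two vectors); nunits == 4 and group_size == 3 gives
   lcm (4, 3) / 3 == 4 copies (three vectors of four scalars).  */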
1468
1469 number_of_places_left_in_vector = nunits;
1470 for (j = 0; j < number_of_copies; j++)
1471 {
1472 for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
1473 {
1474 if (is_store)
1475 op = gimple_assign_rhs1 (stmt);
1476 else
1477 op = gimple_op (stmt, op_num + 1);
1478
1479 /* Create 'vect_ = {op0,op1,...,opn}'. */
1480 t = tree_cons (NULL_TREE, op, t);
1481
1482 number_of_places_left_in_vector--;
1483
1484 if (number_of_places_left_in_vector == 0)
1485 {
1486 number_of_places_left_in_vector = nunits;
1487
1488 if (constant_p)
1489 vec_cst = build_vector (vector_type, t);
1490 else
1491 vec_cst = build_constructor_from_list (vector_type, t);
1492 VEC_quick_push (tree, voprnds,
1493 vect_init_vector (stmt, vec_cst, vector_type, NULL));
1494 t = NULL_TREE;
1495 }
1496 }
1497 }
1498
1499 /* Since the vectors are created in the reverse order, we should invert
1500 them. */
1501 vec_num = VEC_length (tree, voprnds);
1502 for (j = vec_num - 1; j >= 0; j--)
1503 {
1504 vop = VEC_index (tree, voprnds, j);
1505 VEC_quick_push (tree, *vec_oprnds, vop);
1506 }
1507
1508 VEC_free (tree, heap, voprnds);
1509
1510 /* If VF is greater than the unrolling factor needed for the SLP
1511 group of stmts, NUMBER_OF_VECTORS to be created is greater than
1512 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1513 to replicate the vectors. */
1514 while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1515 {
1516 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1517 VEC_quick_push (tree, *vec_oprnds, vop);
1518 }
1519 }
1520
1521
1522 /* Get vectorized definitions from SLP_NODE that contains corresponding
1523 vectorized def-stmts. */
1524
1525 static void
1526 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1527 {
1528 tree vec_oprnd;
1529 gimple vec_def_stmt;
1530 unsigned int i;
1531
1532 gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1533
1534 for (i = 0;
1535 VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1536 i++)
1537 {
1538 gcc_assert (vec_def_stmt);
1539 vec_oprnd = gimple_get_lhs (vec_def_stmt);
1540 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1541 }
1542 }
1543
1544
1545 /* Get vectorized definitions for SLP_NODE.
1546 If the scalar definitions are loop invariants or constants, collect them and
1547 call vect_get_constant_vectors() to create vector stmts.
1548 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1549 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1550 vect_get_slp_vect_defs() to retrieve them.
1551 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
1552 the right node). This is used when the second operand must remain scalar. */
1553
1554 static void
1555 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1556 VEC (tree,heap) **vec_oprnds1)
1557 {
1558 gimple first_stmt;
1559 enum tree_code code;
1560 int number_of_vects;
1561 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
1562
1563 first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1564 /* The number of vector defs is determined by the number of vector statements
1565 in the node from which we get those statements. */
1566 if (SLP_TREE_LEFT (slp_node))
1567 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node));
1568 else
1569 {
1570 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1571 /* Number of vector stmts was calculated according to LHS in
1572 vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if
1573 necessary. See vect_get_smallest_scalar_type() for details. */
1574 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
1575 &rhs_size_unit);
1576 if (rhs_size_unit != lhs_size_unit)
1577 {
1578 number_of_vects *= rhs_size_unit;
1579 number_of_vects /= lhs_size_unit;
1580 }
1581 }
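/* For example (illustrative): if the RHS scalar type is four times wider
   than the LHS type (say 4-byte values stored into 1-byte elements), four
   times as many RHS vectors as LHS vectors are needed, so number_of_vects
   is scaled up by rhs_size_unit / lhs_size_unit.  */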
1582
1583 /* Allocate memory for vectorized defs. */
1584 *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
1585
1586 /* SLP_NODE corresponds either to a group of stores or to a group of
1587 unary/binary operations. We don't call this function for loads. */
1588 if (SLP_TREE_LEFT (slp_node))
1589 /* The defs are already vectorized. */
1590 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1591 else
1592 /* Build vectors from scalar defs. */
1593 vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);
1594
1595 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
1596 /* Since we don't call this function with loads, this is a group of
1597 stores. */
1598 return;
1599
1600 code = gimple_assign_rhs_code (first_stmt);
1601 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
1602 return;
1603
1604 /* The number of vector defs is determined by the number of vector statements
1605 in the node from which we get those statements. */
1606 if (SLP_TREE_RIGHT (slp_node))
1607 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
1608 else
1609 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1610
1611 *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);
1612
1613 if (SLP_TREE_RIGHT (slp_node))
1614 /* The defs are already vectorized. */
1615 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1616 else
1617 /* Build vectors from scalar defs. */
1618 vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
1619 }
1620
1621
1622 /* Function get_initial_def_for_induction
1623
1624 Input:
1625 STMT - a stmt that performs an induction operation in the loop.
1626 IV_PHI - the phi node of the induction variable in the loop
1627
1628 Output:
1629 Return a vector variable, initialized with the first VF values of
1630 the induction variable. E.g., for an iv with IV_PHI='X' and
1631 evolution S, for a vector of 4 units, we want to return:
1632 [X, X + S, X + 2*S, X + 3*S]. */
1633
1634 static tree
1635 get_initial_def_for_induction (gimple iv_phi)
1636 {
1637 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1638 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1639 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1640 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
1641 tree vectype;
1642 int nunits;
1643 edge pe = loop_preheader_edge (loop);
1644 struct loop *iv_loop;
1645 basic_block new_bb;
1646 tree vec, vec_init, vec_step, t;
1647 tree access_fn;
1648 tree new_var;
1649 tree new_name;
1650 gimple init_stmt, induction_phi, new_stmt;
1651 tree induc_def, vec_def, vec_dest;
1652 tree init_expr, step_expr;
1653 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1654 int i;
1655 bool ok;
1656 int ncopies;
1657 tree expr;
1658 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1659 bool nested_in_vect_loop = false;
1660 gimple_seq stmts = NULL;
1661 imm_use_iterator imm_iter;
1662 use_operand_p use_p;
1663 gimple exit_phi;
1664 edge latch_e;
1665 tree loop_arg;
1666 gimple_stmt_iterator si;
1667 basic_block bb = gimple_bb (iv_phi);
1668
1669 vectype = get_vectype_for_scalar_type (scalar_type);
1670 gcc_assert (vectype);
1671 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1672 ncopies = vf / nunits;
1673
1674 gcc_assert (phi_info);
1675 gcc_assert (ncopies >= 1);
1676
1677 /* Find the first insertion point in the BB. */
1678 si = gsi_after_labels (bb);
1679
1680 if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
1681 step_expr = build_int_cst (scalar_type, 0);
1682 else
1683 step_expr = build_real (scalar_type, dconst0);
1684
1685 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1686 if (nested_in_vect_loop_p (loop, iv_phi))
1687 {
1688 nested_in_vect_loop = true;
1689 iv_loop = loop->inner;
1690 }
1691 else
1692 iv_loop = loop;
1693 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
1694
1695 latch_e = loop_latch_edge (iv_loop);
1696 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1697
1698 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1699 gcc_assert (access_fn);
1700 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1701 &init_expr, &step_expr);
1702 gcc_assert (ok);
1703 pe = loop_preheader_edge (iv_loop);
1704
1705 /* Create the vector that holds the initial_value of the induction. */
1706 if (nested_in_vect_loop)
1707 {
1708 /* iv_loop is nested in the loop to be vectorized. init_expr has already
1709 been created during vectorization of previous stmts; we obtain it from
1710 the STMT_VINFO_VEC_STMT of the defining stmt. */
1711 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1712 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1713 }
1714 else
1715 {
1716 /* iv_loop is the loop to be vectorized. Create:
1717 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1718 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1719 add_referenced_var (new_var);
1720
1721 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1722 if (stmts)
1723 {
1724 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
1725 gcc_assert (!new_bb);
1726 }
1727
1728 t = NULL_TREE;
1729 t = tree_cons (NULL_TREE, init_expr, t);
1730 for (i = 1; i < nunits; i++)
1731 {
1732 /* Create: new_name_i = new_name + step_expr */
1733 enum tree_code code = POINTER_TYPE_P (scalar_type)
1734 ? POINTER_PLUS_EXPR : PLUS_EXPR;
1735 init_stmt = gimple_build_assign_with_ops (code, new_var,
1736 new_name, step_expr);
1737 new_name = make_ssa_name (new_var, init_stmt);
1738 gimple_assign_set_lhs (init_stmt, new_name);
1739
1740 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1741 gcc_assert (!new_bb);
1742
1743 if (vect_print_dump_info (REPORT_DETAILS))
1744 {
1745 fprintf (vect_dump, "created new init_stmt: ");
1746 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1747 }
1748 t = tree_cons (NULL_TREE, new_name, t);
1749 }
1750 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1751 vec = build_constructor_from_list (vectype, nreverse (t));
1752 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1753 }
1754
1755
1756 /* Create the vector that holds the step of the induction. */
1757 if (nested_in_vect_loop)
1758 /* iv_loop is nested in the loop to be vectorized. Generate:
1759 vec_step = [S, S, S, S] */
1760 new_name = step_expr;
1761 else
1762 {
1763 /* iv_loop is the loop to be vectorized. Generate:
1764 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1765 expr = build_int_cst (scalar_type, vf);
1766 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1767 }
1768
1769 t = NULL_TREE;
1770 for (i = 0; i < nunits; i++)
1771 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1772 gcc_assert (CONSTANT_CLASS_P (new_name));
1773 vec = build_vector (vectype, t);
1774 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1775
1776
1777 /* Create the following def-use cycle:
1778 loop prolog:
1779 vec_init = ...
1780 vec_step = ...
1781 loop:
1782 vec_iv = PHI <vec_init, vec_loop>
1783 ...
1784 STMT
1785 ...
1786 vec_loop = vec_iv + vec_step; */
1787
1788 /* Create the induction-phi that defines the induction-operand. */
1789 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1790 add_referenced_var (vec_dest);
1791 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1792 set_vinfo_for_stmt (induction_phi,
1793 new_stmt_vec_info (induction_phi, loop_vinfo));
1794 induc_def = PHI_RESULT (induction_phi);
1795
1796 /* Create the iv update inside the loop */
1797 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1798 induc_def, vec_step);
1799 vec_def = make_ssa_name (vec_dest, new_stmt);
1800 gimple_assign_set_lhs (new_stmt, vec_def);
1801 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1802 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
1803
1804 /* Set the arguments of the phi node: */
1805 add_phi_arg (induction_phi, vec_init, pe);
1806 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1807
1808
1809 /* In case that vectorization factor (VF) is bigger than the number
1810 of elements that we can fit in a vectype (nunits), we have to generate
1811 more than one vector stmt - i.e - we need to "unroll" the
1812 vector stmt by a factor VF/nunits. For more details see documentation
1813 in vectorizable_operation. */
1814
1815 if (ncopies > 1)
1816 {
1817 stmt_vec_info prev_stmt_vinfo;
1818 /* FORNOW. This restriction should be relaxed. */
1819 gcc_assert (!nested_in_vect_loop);
1820
1821 /* Create the vector that holds the step of the induction. */
1822 expr = build_int_cst (scalar_type, nunits);
1823 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1824 t = NULL_TREE;
1825 for (i = 0; i < nunits; i++)
1826 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1827 gcc_assert (CONSTANT_CLASS_P (new_name));
1828 vec = build_vector (vectype, t);
1829 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1830
1831 vec_def = induc_def;
1832 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1833 for (i = 1; i < ncopies; i++)
1834 {
1835 /* vec_i = vec_prev + vec_step */
1836 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1837 vec_def, vec_step);
1838 vec_def = make_ssa_name (vec_dest, new_stmt);
1839 gimple_assign_set_lhs (new_stmt, vec_def);
1840
1841 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1842 set_vinfo_for_stmt (new_stmt,
1843 new_stmt_vec_info (new_stmt, loop_vinfo));
1844 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1845 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1846 }
1847 }
1848
1849 if (nested_in_vect_loop)
1850 {
1851 /* Find the loop-closed exit-phi of the induction, and record
1852 the final vector of induction results: */
1853 exit_phi = NULL;
1854 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1855 {
1856 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
1857 {
1858 exit_phi = USE_STMT (use_p);
1859 break;
1860 }
1861 }
1862 if (exit_phi)
1863 {
1864 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1865 /* FORNOW. Currently not supporting the case that an inner-loop induction
1866 is not used in the outer-loop (i.e. only outside the outer-loop). */
1867 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1868 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1869
1870 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1871 if (vect_print_dump_info (REPORT_DETAILS))
1872 {
1873 fprintf (vect_dump, "vector of inductions after inner-loop:");
1874 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
1875 }
1876 }
1877 }
1878
1879
1880 if (vect_print_dump_info (REPORT_DETAILS))
1881 {
1882 fprintf (vect_dump, "transform induction: created def-use cycle: ");
1883 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
1884 fprintf (vect_dump, "\n");
1885 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
1886 }
1887
1888 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1889 return induc_def;
1890 }
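/* Illustration added for exposition (not part of this file): the values
   produced by the ncopies > 1 case above, on plain scalars. With
   hypothetical X = 0, S = 1, VF = 8 and nunits = 4, copy 0 holds
   {0,1,2,3} and copy 1 (copy 0 plus nunits*S) holds {4,5,6,7}; both
   advance by VF*S = 8 per iteration of the vectorized loop. */
static void
example_induction_copies (long x, long s, int vf, int nunits, long *out)
{
  int copy, i;
  int ncopies = vf / nunits;
  for (copy = 0; copy < ncopies; copy++)
    for (i = 0; i < nunits; i++)
      out[copy * nunits + i] = x + (copy * nunits + i) * s;
}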
1891
1892
1893 /* Function vect_get_vec_def_for_operand.
1894
1895 OP is an operand in STMT. This function returns a (vector) def that will be
1896 used in the vectorized stmt for STMT.
1897
1898 In the case that OP is an SSA_NAME which is defined in the loop, then
1899 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1900
1901 In case OP is an invariant or constant, a new stmt that creates a vector def
1902 needs to be introduced. */
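/* Illustration added for exposition (not part of this file): cases 1 and 2
   below both splat a single scalar across the vector, as sketched here on
   a plain array ({op, op, ..., op} for nunits elements). */
static void
example_splat_operand (long op, int nunits, long *vec)
{
  int i;
  for (i = 0; i < nunits; i++)
    vec[i] = op;
}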
1903
1904 static tree
1905 vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
1906 {
1907 tree vec_oprnd;
1908 gimple vec_stmt;
1909 gimple def_stmt;
1910 stmt_vec_info def_stmt_info = NULL;
1911 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1912 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1913 unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1914 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1915 tree vec_inv;
1916 tree vec_cst;
1917 tree t = NULL_TREE;
1918 tree def;
1919 int i;
1920 enum vect_def_type dt;
1921 bool is_simple_use;
1922 tree vector_type;
1923
1924 if (vect_print_dump_info (REPORT_DETAILS))
1925 {
1926 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1927 print_generic_expr (vect_dump, op, TDF_SLIM);
1928 }
1929
1930 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1931 gcc_assert (is_simple_use);
1932 if (vect_print_dump_info (REPORT_DETAILS))
1933 {
1934 if (def)
1935 {
1936 fprintf (vect_dump, "def = ");
1937 print_generic_expr (vect_dump, def, TDF_SLIM);
1938 }
1939 if (def_stmt)
1940 {
1941 fprintf (vect_dump, " def_stmt = ");
1942 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1943 }
1944 }
1945
1946 switch (dt)
1947 {
1948 /* Case 1: operand is a constant. */
1949 case vect_constant_def:
1950 {
1951 if (scalar_def)
1952 *scalar_def = op;
1953
1954 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1955 if (vect_print_dump_info (REPORT_DETAILS))
1956 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1957
1958 for (i = nunits - 1; i >= 0; --i)
1959 {
1960 t = tree_cons (NULL_TREE, op, t);
1961 }
1962 vec_cst = build_vector (vectype, t);
1963 return vect_init_vector (stmt, vec_cst, vectype, NULL);
1964 }
1965
1966 /* Case 2: operand is defined outside the loop - loop invariant. */
1967 case vect_invariant_def:
1968 {
1969 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1970 gcc_assert (vector_type);
1971 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
1972
1973 if (scalar_def)
1974 *scalar_def = def;
1975
1976 /* Create 'vec_inv = {inv,inv,..,inv}' */
1977 if (vect_print_dump_info (REPORT_DETAILS))
1978 fprintf (vect_dump, "Create vector_inv.");
1979
1980 for (i = nunits - 1; i >= 0; --i)
1981 {
1982 t = tree_cons (NULL_TREE, def, t);
1983 }
1984
1985 /* FIXME: use build_constructor directly. */
1986 vec_inv = build_constructor_from_list (vector_type, t);
1987 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1988 }
1989
1990 /* Case 3: operand is defined inside the loop. */
1991 case vect_loop_def:
1992 {
1993 if (scalar_def)
1994 *scalar_def = NULL/* FIXME tuples: def_stmt*/;
1995
1996 /* Get the def from the vectorized stmt. */
1997 def_stmt_info = vinfo_for_stmt (def_stmt);
1998 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1999 gcc_assert (vec_stmt);
2000 if (gimple_code (vec_stmt) == GIMPLE_PHI)
2001 vec_oprnd = PHI_RESULT (vec_stmt);
2002 else if (is_gimple_call (vec_stmt))
2003 vec_oprnd = gimple_call_lhs (vec_stmt);
2004 else
2005 vec_oprnd = gimple_assign_lhs (vec_stmt);
2006 return vec_oprnd;
2007 }
2008
2009 /* Case 4: operand is defined by a loop header phi - reduction */
2010 case vect_reduction_def:
2011 {
2012 struct loop *loop;
2013
2014 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2015 loop = (gimple_bb (def_stmt))->loop_father;
2016
2017 /* Get the def before the loop */
2018 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
2019 return get_initial_def_for_reduction (stmt, op, scalar_def);
2020 }
2021
2022 /* Case 5: operand is defined by loop-header phi - induction. */
2023 case vect_induction_def:
2024 {
2025 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2026
2027 /* Get the def from the vectorized stmt. */
2028 def_stmt_info = vinfo_for_stmt (def_stmt);
2029 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
2030 gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
2031 vec_oprnd = PHI_RESULT (vec_stmt);
2032 return vec_oprnd;
2033 }
2034
2035 default:
2036 gcc_unreachable ();
2037 }
2038 }
2039
2040
2041 /* Function vect_get_vec_def_for_stmt_copy
2042
2043 Return a vector-def for an operand. This function is used when the
2044 vectorized stmt to be created (by the caller to this function) is a "copy"
2045 created in case the vectorized result cannot fit in one vector, and several
2046 copies of the vector-stmt are required. In this case the vector-def is
2047 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
2048 of the stmt that defines VEC_OPRND.
2049 DT is the type of the vector def VEC_OPRND.
2050
2051 Context:
2052 In case the vectorization factor (VF) is bigger than the number
2053 of elements that can fit in a vectype (nunits), we have to generate
2054 more than one vector stmt to vectorize the scalar stmt. This situation
2055 arises when there are multiple data-types operated upon in the loop; the
2056 smallest data-type determines the VF, and as a result, when vectorizing
2057 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
2058 vector stmt (each computing a vector of 'nunits' results, and together
2059 computing 'VF' results in each iteration). This function is called when
2060 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
2061 which VF=16 and nunits=4, so the number of copies required is 4):
2062
2063 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
2064
2065 S1: x = load VS1.0: vx.0 = memref0 VS1.1
2066 VS1.1: vx.1 = memref1 VS1.2
2067 VS1.2: vx.2 = memref2 VS1.3
2068 VS1.3: vx.3 = memref3
2069
2070 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
2071 VSnew.1: vz1 = vx.1 + ... VSnew.2
2072 VSnew.2: vz2 = vx.2 + ... VSnew.3
2073 VSnew.3: vz3 = vx.3 + ...
2074
2075 The vectorization of S1 is explained in vectorizable_load.
2076 The vectorization of S2:
2077 To create the first vector-stmt out of the 4 copies - VSnew.0 -
2078 the function 'vect_get_vec_def_for_operand' is called to
2079 get the relevant vector-def for each operand of S2. For operand x it
2080 returns the vector-def 'vx.0'.
2081
2082 To create the remaining copies of the vector-stmt (VSnew.j), this
2083 function is called to get the relevant vector-def for each operand. It is
2084 obtained from the respective VS1.j stmt, which is recorded in the
2085 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2086
2087 For example, to obtain the vector-def 'vx.1' in order to create the
2088 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2089 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2090 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2091 and return its def ('vx.1').
2092 Overall, to create the above sequence this function will be called 3 times:
2093 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2094 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2095 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
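/* Illustration added for exposition (not part of this file): the chain
   walk described above, modelled as a singly linked list. Each node
   stands for a vector stmt; 'related' plays the role of
   STMT_VINFO_RELATED_STMT and 'def' the role of its lhs. Given the stmt
   defining copy j's operand, the def of copy j+1 is returned. */
struct example_vec_stmt
{
  long def;                           /* stands in for the vector def (lhs) */
  struct example_vec_stmt *related;   /* next copy in the chain, or NULL */
};

static long
example_def_for_next_copy (const struct example_vec_stmt *def_stmt)
{
  return def_stmt->related->def;      /* e.g. from VS1.0 obtain vx.1 */
}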
2096
2097 static tree
2098 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2099 {
2100 gimple vec_stmt_for_operand;
2101 stmt_vec_info def_stmt_info;
2102
2103 /* Do nothing; can reuse same def. */
2104 if (dt == vect_invariant_def || dt == vect_constant_def)
2105 return vec_oprnd;
2106
2107 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2108 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2109 gcc_assert (def_stmt_info);
2110 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2111 gcc_assert (vec_stmt_for_operand);
2112 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2113 if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
2114 vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
2115 else
2116 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2117 return vec_oprnd;
2118 }
2119
2120
2121 /* Get vectorized definitions for the operands to create a copy of an original
2122 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2123
2124 static void
2125 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2126 VEC(tree,heap) **vec_oprnds0,
2127 VEC(tree,heap) **vec_oprnds1)
2128 {
2129 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2130
2131 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2132 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2133
2134 if (vec_oprnds1 && *vec_oprnds1)
2135 {
2136 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2137 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2138 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2139 }
2140 }
2141
2142
2143 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
2144
2145 static void
2146 vect_get_vec_defs (tree op0, tree op1, gimple stmt,
2147 VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
2148 slp_tree slp_node)
2149 {
2150 if (slp_node)
2151 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2152 else
2153 {
2154 tree vec_oprnd;
2155
2156 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2157 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2158 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2159
2160 if (op1)
2161 {
2162 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2163 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2164 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2165 }
2166 }
2167 }
2168
2169
2170 /* Function vect_finish_stmt_generation.
2171
2172 Insert a new stmt. */
2173
2174 static void
2175 vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
2176 gimple_stmt_iterator *gsi)
2177 {
2178 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2179 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2180
2181 gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);
2182
2183 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
2184
2185 set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));
2186
2187 if (vect_print_dump_info (REPORT_DETAILS))
2188 {
2189 fprintf (vect_dump, "add new stmt: ");
2190 print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
2191 }
2192
2193 gimple_set_location (vec_stmt, gimple_location (gsi_stmt (*gsi)));
2194 }
2195
2196
2197 /* Function get_initial_def_for_reduction
2198
2199 Input:
2200 STMT - a stmt that performs a reduction operation in the loop.
2201 INIT_VAL - the initial value of the reduction variable
2202
2203 Output:
2204 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2205 of the reduction (used for adjusting the epilog - see below).
2206 Return a vector variable, initialized according to the operation that STMT
2207 performs. This vector will be used as the initial value of the
2208 vector of partial results.
2209
2210 Option1 (adjust in epilog): Initialize the vector as follows:
2211 add: [0,0,...,0,0]
2212 mult: [1,1,...,1,1]
2213 min/max: [init_val,init_val,..,init_val,init_val]
2214 bit and/or: [init_val,init_val,..,init_val,init_val]
2215 and when necessary (e.g. add/mult case) let the caller know
2216 that it needs to adjust the result by init_val.
2217
2218 Option2: Initialize the vector as follows:
2219 add: [0,0,...,0,init_val]
2220 mult: [1,1,...,1,init_val]
2221 min/max: [init_val,init_val,...,init_val]
2222 bit and/or: [init_val,init_val,...,init_val]
2223 and no adjustments are needed.
2224
2225 For example, for the following code:
2226
2227 s = init_val;
2228 for (i=0;i<n;i++)
2229 s = s + a[i];
2230
2231 STMT is 's = s + a[i]', and the reduction variable is 's'.
2232 For a vector of 4 units, we want to return either [0,0,0,init_val],
2233 or [0,0,0,0] and let the caller know that it needs to adjust
2234 the result at the end by 'init_val'.
2235
2236 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2237 initialization vector is simpler (same element in all entries).
2238 A cost model should help decide between these two schemes. */
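/* Illustration added for exposition (not part of this file): the
   'adjust in epilog' scheme chosen above, for a PLUS reduction, on plain
   arrays. The accumulator starts as [0,0,0,0] and init_val is added back
   once at the end. Assumes a 4-unit vector and n a multiple of 4. */
static long
example_sum_adjust_in_epilog (const long *a, int n, long init_val)
{
  long acc[4] = { 0, 0, 0, 0 };   /* initial def: vector of zeros */
  long s;
  int i, j;
  for (i = 0; i + 4 <= n; i += 4)  /* the vectorized loop */
    for (j = 0; j < 4; j++)
      acc[j] += a[i + j];
  s = acc[0] + acc[1] + acc[2] + acc[3];  /* epilog: reduce partial sums */
  return s + init_val;                    /* epilog: adjust by init_val */
}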
2239
2240 static tree
2241 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
2242 {
2243 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2244 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2245 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2246 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2247 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2248 tree scalar_type = TREE_TYPE (vectype);
2249 enum tree_code code = gimple_assign_rhs_code (stmt);
2250 tree type = TREE_TYPE (init_val);
2251 tree vecdef;
2252 tree def_for_init;
2253 tree init_def;
2254 tree t = NULL_TREE;
2255 int i;
2256 bool nested_in_vect_loop = false;
2257
2258 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2259 if (nested_in_vect_loop_p (loop, stmt))
2260 nested_in_vect_loop = true;
2261 else
2262 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2263
2264 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2265
2266 switch (code)
2267 {
2268 case WIDEN_SUM_EXPR:
2269 case DOT_PROD_EXPR:
2270 case PLUS_EXPR:
2271 if (nested_in_vect_loop)
2272 *adjustment_def = vecdef;
2273 else
2274 *adjustment_def = init_val;
2275 /* Create a vector of zeros for init_def. */
2276 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2277 def_for_init = build_real (scalar_type, dconst0);
2278 else
2279 def_for_init = build_int_cst (scalar_type, 0);
2280
2281 for (i = nunits - 1; i >= 0; --i)
2282 t = tree_cons (NULL_TREE, def_for_init, t);
2283 init_def = build_vector (vectype, t);
2284 break;
2285
2286 case MIN_EXPR:
2287 case MAX_EXPR:
2288 *adjustment_def = NULL_TREE;
2289 init_def = vecdef;
2290 break;
2291
2292 default:
2293 gcc_unreachable ();
2294 }
2295
2296 return init_def;
2297 }
2298
2299
2300 /* Function vect_create_epilog_for_reduction
2301
2302 Create code at the loop-epilog to finalize the result of a reduction
2303 computation.
2304
2305 VECT_DEF is a vector of partial results.
2306 REDUC_CODE is the tree-code for the epilog reduction.
2307 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2308 number of elements that we can fit in a vectype (nunits). In this case
2309 we have to generate more than one vector stmt - i.e - we need to "unroll"
2310 the vector stmt by a factor VF/nunits. For more details see documentation
2311 in vectorizable_operation.
2312 STMT is the scalar reduction stmt that is being vectorized.
2313 REDUCTION_PHI is the phi-node that carries the reduction computation.
2314
2315 This function:
2316 1. Creates the reduction def-use cycle: sets the arguments for
2317 REDUCTION_PHI:
2318 The loop-entry argument is the vectorized initial-value of the reduction.
2319 The loop-latch argument is VECT_DEF - the vector of partial sums.
2320 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2321 by applying the operation specified by REDUC_CODE if available, or by
2322 other means (whole-vector shifts or a scalar loop).
2323 The function also creates a new phi node at the loop exit to preserve
2324 loop-closed form, as illustrated below.
2325
2326 The flow at the entry to this function:
2327
2328 loop:
2329 vec_def = phi <null, null> # REDUCTION_PHI
2330 VECT_DEF = vector_stmt # vectorized form of STMT
2331 s_loop = scalar_stmt # (scalar) STMT
2332 loop_exit:
2333 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2334 use <s_out0>
2335 use <s_out0>
2336
2337 The above is transformed by this function into:
2338
2339 loop:
2340 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2341 VECT_DEF = vector_stmt # vectorized form of STMT
2342 s_loop = scalar_stmt # (scalar) STMT
2343 loop_exit:
2344 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2345 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2346 v_out2 = reduce <v_out1>
2347 s_out3 = extract_field <v_out2, 0>
2348 s_out4 = adjust_result <s_out3>
2349 use <s_out4>
2350 use <s_out4>
2351 */
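/* Illustration added for exposition (not part of this file): roughly what
   the whole-vector-shift reduction ('scheme 2' in the steps below)
   computes, modelled on a plain array. Each round adds the upper half of
   the vector to the lower half; after log2(n) rounds the scalar result is
   in v[0]. Assumes n is a power of two; v is clobbered. */
static long
example_reduce_by_halving (long *v, int n)
{
  int half, i;
  for (half = n / 2; half >= 1; half /= 2)
    for (i = 0; i < half; i++)
      v[i] = v[i] + v[i + half];   /* va = vop <va, vec_shift <va, half>> */
  return v[0];
}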
2352
2353 static void
2354 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2355 int ncopies,
2356 enum tree_code reduc_code,
2357 gimple reduction_phi)
2358 {
2359 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2360 stmt_vec_info prev_phi_info;
2361 tree vectype;
2362 enum machine_mode mode;
2363 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2364 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2365 basic_block exit_bb;
2366 tree scalar_dest;
2367 tree scalar_type;
2368 gimple new_phi = NULL, phi;
2369 gimple_stmt_iterator exit_gsi;
2370 tree vec_dest;
2371 tree new_temp = NULL_TREE;
2372 tree new_name;
2373 gimple epilog_stmt = NULL;
2374 tree new_scalar_dest, new_dest;
2375 gimple exit_phi;
2376 tree bitsize, bitpos, bytesize;
2377 enum tree_code code = gimple_assign_rhs_code (stmt);
2378 tree adjustment_def;
2379 tree vec_initial_def, def;
2380 tree orig_name;
2381 imm_use_iterator imm_iter;
2382 use_operand_p use_p;
2383 bool extract_scalar_result = false;
2384 tree reduction_op, expr;
2385 gimple orig_stmt;
2386 gimple use_stmt;
2387 bool nested_in_vect_loop = false;
2388 VEC(gimple,heap) *phis = NULL;
2389 enum vect_def_type dt = vect_unknown_def_type;
2390 int j, i;
2391
2392 if (nested_in_vect_loop_p (loop, stmt))
2393 {
2394 loop = loop->inner;
2395 nested_in_vect_loop = true;
2396 }
2397
2398 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2399 {
2400 case GIMPLE_SINGLE_RHS:
2401 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2402 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2403 break;
2404 case GIMPLE_UNARY_RHS:
2405 reduction_op = gimple_assign_rhs1 (stmt);
2406 break;
2407 case GIMPLE_BINARY_RHS:
2408 reduction_op = gimple_assign_rhs2 (stmt);
2409 break;
2410 default:
2411 gcc_unreachable ();
2412 }
2413
2414 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2415 gcc_assert (vectype);
2416 mode = TYPE_MODE (vectype);
2417
2418 /*** 1. Create the reduction def-use cycle ***/
2419
2420 /* For the case of reduction, vect_get_vec_def_for_operand returns
2421 the scalar def before the loop that defines the initial value
2422 of the reduction variable. */
2423 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2424 &adjustment_def);
2425
2426 phi = reduction_phi;
2427 def = vect_def;
2428 for (j = 0; j < ncopies; j++)
2429 {
2430 /* 1.1 set the loop-entry arg of the reduction-phi: */
2431 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2432
2433 /* 1.2 set the loop-latch arg for the reduction-phi: */
2434 if (j > 0)
2435 def = vect_get_vec_def_for_stmt_copy (dt, def);
2436 add_phi_arg (phi, def, loop_latch_edge (loop));
2437
2438 if (vect_print_dump_info (REPORT_DETAILS))
2439 {
2440 fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2441 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2442 fprintf (vect_dump, "\n");
2443 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2444 }
2445
2446 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2447 }
2448
2449 /*** 2. Create epilog code
2450 The reduction epilog code operates across the elements of the vector
2451 of partial results computed by the vectorized loop.
2452 The reduction epilog code consists of:
2453 step 1: compute the scalar result in a vector (v_out2)
2454 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2455 step 3: adjust the scalar result (s_out3) if needed.
2456
2457 Step 1 can be accomplished using one of the following three schemes:
2458 (scheme 1) using reduc_code, if available.
2459 (scheme 2) using whole-vector shifts, if available.
2460 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2461 combined.
2462
2463 The overall epilog code looks like this:
2464
2465 s_out0 = phi <s_loop> # original EXIT_PHI
2466 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2467 v_out2 = reduce <v_out1> # step 1
2468 s_out3 = extract_field <v_out2, 0> # step 2
2469 s_out4 = adjust_result <s_out3> # step 3
2470
2471 (step 3 is optional, and steps 1 and 2 may be combined).
2472 Lastly, the uses of s_out0 are replaced by s_out4.
2473
2474 ***/
2475
2476 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2477 v_out1 = phi <v_loop> */
2478
2479 exit_bb = single_exit (loop)->dest;
2480 def = vect_def;
2481 prev_phi_info = NULL;
2482 for (j = 0; j < ncopies; j++)
2483 {
2484 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2485 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
2486 if (j == 0)
2487 new_phi = phi;
2488 else
2489 {
2490 def = vect_get_vec_def_for_stmt_copy (dt, def);
2491 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2492 }
2493 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
2494 prev_phi_info = vinfo_for_stmt (phi);
2495 }
2496 exit_gsi = gsi_after_labels (exit_bb);
2497
2498 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2499 (i.e. when reduc_code is not available) and in the final adjustment
2500 code (if needed). Also get the original scalar reduction variable as
2501 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2502 represents a reduction pattern), the tree-code and scalar-def are
2503 taken from the original stmt that the pattern-stmt (STMT) replaces.
2504 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2505 are taken from STMT. */
2506
2507 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2508 if (!orig_stmt)
2509 {
2510 /* Regular reduction */
2511 orig_stmt = stmt;
2512 }
2513 else
2514 {
2515 /* Reduction pattern */
2516 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2517 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2518 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2519 }
2520 code = gimple_assign_rhs_code (orig_stmt);
2521 scalar_dest = gimple_assign_lhs (orig_stmt);
2522 scalar_type = TREE_TYPE (scalar_dest);
2523 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2524 bitsize = TYPE_SIZE (scalar_type);
2525 bytesize = TYPE_SIZE_UNIT (scalar_type);
2526
2527
2528 /* In case this is a reduction in an inner-loop while vectorizing an outer
2529 loop - we don't need to extract a single scalar result at the end of the
2530 inner-loop. The final vector of partial results will be used in the
2531 vectorized outer-loop, or reduced to a scalar result at the end of the
2532 outer-loop. */
2533 if (nested_in_vect_loop)
2534 goto vect_finalize_reduction;
2535
2536 /* FORNOW */
2537 gcc_assert (ncopies == 1);
2538
2539 /* 2.3 Create the reduction code, using one of the three schemes described
2540 above. */
2541
2542 if (reduc_code < NUM_TREE_CODES)
2543 {
2544 tree tmp;
2545
2546 /*** Case 1: Create:
2547 v_out2 = reduc_expr <v_out1> */
2548
2549 if (vect_print_dump_info (REPORT_DETAILS))
2550 fprintf (vect_dump, "Reduce using direct vector reduction.");
2551
2552 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2553 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2554 epilog_stmt = gimple_build_assign (vec_dest, tmp);
2555 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2556 gimple_assign_set_lhs (epilog_stmt, new_temp);
2557 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2558
2559 extract_scalar_result = true;
2560 }
2561 else
2562 {
2563 enum tree_code shift_code = 0;
2564 bool have_whole_vector_shift = true;
2565 int bit_offset;
2566 int element_bitsize = tree_low_cst (bitsize, 1);
2567 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2568 tree vec_temp;
2569
2570 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2571 shift_code = VEC_RSHIFT_EXPR;
2572 else
2573 have_whole_vector_shift = false;
2574
2575 /* Regardless of whether we have a whole vector shift, if we're
2576 emulating the operation via tree-vect-generic, we don't want
2577 to use it. Only the first round of the reduction is likely
2578 to still be profitable via emulation. */
2579 /* ??? It might be better to emit a reduction tree code here, so that
2580 tree-vect-generic can expand the first round via bit tricks. */
2581 if (!VECTOR_MODE_P (mode))
2582 have_whole_vector_shift = false;
2583 else
2584 {
2585 optab optab = optab_for_tree_code (code, vectype, optab_default);
2586 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2587 have_whole_vector_shift = false;
2588 }
2589
2590 if (have_whole_vector_shift)
2591 {
2592 /*** Case 2: Create:
2593 for (offset = VS/2; offset >= element_size; offset/=2)
2594 {
2595 Create: va' = vec_shift <va, offset>
2596 Create: va = vop <va, va'>
2597 } */
2598
2599 if (vect_print_dump_info (REPORT_DETAILS))
2600 fprintf (vect_dump, "Reduce using vector shifts");
2601
2602 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2603 new_temp = PHI_RESULT (new_phi);
2604
2605 for (bit_offset = vec_size_in_bits/2;
2606 bit_offset >= element_bitsize;
2607 bit_offset /= 2)
2608 {
2609 tree bitpos = size_int (bit_offset);
2610 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
2611 new_temp, bitpos);
2612 new_name = make_ssa_name (vec_dest, epilog_stmt);
2613 gimple_assign_set_lhs (epilog_stmt, new_name);
2614 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2615
2616 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
2617 new_name, new_temp);
2618 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2619 gimple_assign_set_lhs (epilog_stmt, new_temp);
2620 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2621 }
2622
2623 extract_scalar_result = true;
2624 }
2625 else
2626 {
2627 tree rhs;
2628
2629 /*** Case 3: Create:
2630 s = extract_field <v_out2, 0>
2631 for (offset = element_size;
2632 offset < vector_size;
2633 offset += element_size;)
2634 {
2635 Create: s' = extract_field <v_out2, offset>
2636 Create: s = op <s, s'>
2637 } */
2638
2639 if (vect_print_dump_info (REPORT_DETAILS))
2640 fprintf (vect_dump, "Reduce using scalar code. ");
2641
2642 vec_temp = PHI_RESULT (new_phi);
2643 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2644 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2645 bitsize_zero_node);
2646 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2647 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2648 gimple_assign_set_lhs (epilog_stmt, new_temp);
2649 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2650
2651 for (bit_offset = element_bitsize;
2652 bit_offset < vec_size_in_bits;
2653 bit_offset += element_bitsize)
2654 {
2655 tree bitpos = bitsize_int (bit_offset);
2656 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2657 bitpos);
2658
2659 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2660 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2661 gimple_assign_set_lhs (epilog_stmt, new_name);
2662 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2663
2664 epilog_stmt = gimple_build_assign_with_ops (code,
2665 new_scalar_dest,
2666 new_name, new_temp);
2667 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2668 gimple_assign_set_lhs (epilog_stmt, new_temp);
2669 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2670 }
2671
2672 extract_scalar_result = false;
2673 }
2674 }
2675
2676 /* 2.4 Extract the final scalar result. Create:
2677 s_out3 = extract_field <v_out2, bitpos> */
2678
2679 if (extract_scalar_result)
2680 {
2681 tree rhs;
2682
2683 gcc_assert (!nested_in_vect_loop);
2684 if (vect_print_dump_info (REPORT_DETAILS))
2685 fprintf (vect_dump, "extract scalar result");
2686
2687 if (BYTES_BIG_ENDIAN)
2688 bitpos = size_binop (MULT_EXPR,
2689 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2690 TYPE_SIZE (scalar_type));
2691 else
2692 bitpos = bitsize_zero_node;
2693
2694 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2695 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2696 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2697 gimple_assign_set_lhs (epilog_stmt, new_temp);
2698 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2699 }
2700
2701 vect_finalize_reduction:
2702
2703 /* 2.5 Adjust the final result by the initial value of the reduction
2704 variable. (When such adjustment is not needed, then
2705 'adjustment_def' is zero). For example, if code is PLUS we create:
2706 new_temp = loop_exit_def + adjustment_def */
2707
2708 if (adjustment_def)
2709 {
2710 if (nested_in_vect_loop)
2711 {
2712 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2713 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2714 new_dest = vect_create_destination_var (scalar_dest, vectype);
2715 }
2716 else
2717 {
2718 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2719 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2720 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2721 }
2722 epilog_stmt = gimple_build_assign (new_dest, expr);
2723 new_temp = make_ssa_name (new_dest, epilog_stmt);
2724 gimple_assign_set_lhs (epilog_stmt, new_temp);
2725 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
2726 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2727 }
2728
2729
2730 /* 2.6 Handle the loop-exit phi */
2731
2732 /* Replace uses of s_out0 with uses of s_out3:
2733 Find the loop-closed-use at the loop exit of the original scalar result.
2734 (The reduction result is expected to have two immediate uses - one at the
2735 latch block, and one at the loop exit). */
2736 phis = VEC_alloc (gimple, heap, 10);
2737 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2738 {
2739 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2740 {
2741 exit_phi = USE_STMT (use_p);
2742 VEC_quick_push (gimple, phis, exit_phi);
2743 }
2744 }
2745 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2746 gcc_assert (!VEC_empty (gimple, phis));
2747
2748 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
2749 {
2750 if (nested_in_vect_loop)
2751 {
2752 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2753
2754 /* FORNOW. Currently not supporting the case that an inner-loop
2755 reduction is not used in the outer-loop (but only outside the
2756 outer-loop). */
2757 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2758 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2759
2760 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2761 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2762 set_vinfo_for_stmt (epilog_stmt,
2763 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2764 if (adjustment_def)
2765 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
2766 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
2767 continue;
2768 }
2769
2770 /* Replace the uses: */
2771 orig_name = PHI_RESULT (exit_phi);
2772 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2773 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2774 SET_USE (use_p, new_temp);
2775 }
2776 VEC_free (gimple, heap, phis);
2777 }
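/* Illustration added for exposition (not part of this file): the effect of
   'scheme 3' above plus the final adjustment, on a plain array of partial
   results. Elements are combined one by one and the result is then
   adjusted by the initial value of the reduction (when needed). */
static long
example_reduce_scalar_loop (const long *partial, int nunits, long adjustment)
{
  long s = partial[0];      /* s = extract_field <v_out2, 0> */
  int i;
  for (i = 1; i < nunits; i++)
    s = s + partial[i];     /* s = op <s, extract_field <v_out2, i>> */
  return s + adjustment;    /* step 3: adjust_result */
}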
2778
2779
2780 /* Function vectorizable_reduction.
2781
2782 Check if STMT performs a reduction operation that can be vectorized.
2783 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2784 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2785 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2786
2787 This function also handles reduction idioms (patterns) that have been
2788 recognized in advance during vect_pattern_recog. In this case, STMT may be
2789 of this form:
2790 X = pattern_expr (arg0, arg1, ..., X)
2791 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2792 sequence that had been detected and replaced by the pattern-stmt (STMT).
2793
2794 In some cases of reduction patterns, the type of the reduction variable X is
2795 different than the type of the other arguments of STMT.
2796 In such cases, the vectype that is used when transforming STMT into a vector
2797 stmt is different than the vectype that is used to determine the
2798 vectorization factor, because it consists of a different number of elements
2799 than the actual number of elements that are being operated upon in parallel.
2800
2801 For example, consider an accumulation of shorts into an int accumulator.
2802 On some targets it's possible to vectorize this pattern operating on 8
2803 shorts at a time (hence, the vectype for purposes of determining the
2804 vectorization factor should be V8HI); on the other hand, the vectype that
2805 is used to create the vector form is actually V4SI (the type of the result).
2806
2807 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2808 indicates the actual level of parallelism (V8HI in the example), so
2809 that the right vectorization factor is derived. This vectype
2810 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2811 be used to create the vectorized stmt. The right vectype for the vectorized
2812 stmt is obtained from the type of the result X:
2813 get_vectype_for_scalar_type (TREE_TYPE (X))
2814
2815 This means that, contrary to "regular" reductions (or "regular" stmts in
2816 general), the following equation:
2817 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2818 does *NOT* necessarily hold for reduction patterns. */
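/* Illustration added for exposition (not part of this file): the scalar
   source the widen-sum discussion above refers to - shorts accumulated
   into an int. The vectorization factor is derived from the short
   elements (e.g. 8 per vector, V8HI), while the vectorized accumulator
   uses the int type (e.g. V4SI). */
static int
example_widen_sum (const short *a, int n)
{
  int acc = 0;   /* reduction variable X, wider than the elements */
  int i;
  for (i = 0; i < n; i++)
    acc += a[i];   /* recognized as a widen_sum pattern */
  return acc;
}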
2819
2820 bool
2821 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
2822 gimple *vec_stmt)
2823 {
2824 tree vec_dest;
2825 tree scalar_dest;
2826 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2827 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2828 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2829 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2830 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2831 enum tree_code code, orig_code, epilog_reduc_code = 0;
2832 enum machine_mode vec_mode;
2833 int op_type;
2834 optab optab, reduc_optab;
2835 tree new_temp = NULL_TREE;
2836 tree def;
2837 gimple def_stmt;
2838 enum vect_def_type dt;
2839 gimple new_phi = NULL;
2840 tree scalar_type;
2841 bool is_simple_use;
2842 gimple orig_stmt;
2843 stmt_vec_info orig_stmt_info;
2844 tree expr = NULL_TREE;
2845 int i;
2846 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2847 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2848 int epilog_copies;
2849 stmt_vec_info prev_stmt_info, prev_phi_info;
2850 gimple first_phi = NULL;
2851 bool single_defuse_cycle = false;
2852 tree reduc_def;
2853 gimple new_stmt = NULL;
2854 int j;
2855 tree ops[3];
2856
2857 if (nested_in_vect_loop_p (loop, stmt))
2858 loop = loop->inner;
2859
2860 gcc_assert (ncopies >= 1);
2861
2862 /* FORNOW: SLP not supported. */
2863 if (STMT_SLP_TYPE (stmt_info))
2864 return false;
2865
2866 /* 1. Is vectorizable reduction? */
2867
2868 /* Not supportable if the reduction variable is used in the loop. */
2869 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2870 return false;
2871
2872 /* Reductions that are not used even in an enclosing outer-loop
2873 are expected to be "live" (used out of the loop). */
2874 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2875 && !STMT_VINFO_LIVE_P (stmt_info))
2876 return false;
2877
2878 /* Make sure it was already recognized as a reduction computation. */
2879 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2880 return false;
2881
2882 /* 2. Has this been recognized as a reduction pattern?
2883
2884 Check if STMT represents a pattern that has been recognized
2885 in earlier analysis stages. For stmts that represent a pattern,
2886 the STMT_VINFO_RELATED_STMT field records the last stmt in
2887 the original sequence that constitutes the pattern. */
2888
2889 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2890 if (orig_stmt)
2891 {
2892 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2893 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2894 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2895 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2896 }
2897
2898 /* 3. Check the operands of the operation. The first operands are defined
2899 inside the loop body. The last operand is the reduction variable,
2900 which is defined by the loop-header-phi. */
2901
2902 gcc_assert (is_gimple_assign (stmt));
2903
2904 /* Flatten RHS */
2905 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2906 {
2907 case GIMPLE_SINGLE_RHS:
2908 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
2909 if (op_type == ternary_op)
2910 {
2911 tree rhs = gimple_assign_rhs1 (stmt);
2912 ops[0] = TREE_OPERAND (rhs, 0);
2913 ops[1] = TREE_OPERAND (rhs, 1);
2914 ops[2] = TREE_OPERAND (rhs, 2);
2915 code = TREE_CODE (rhs);
2916 }
2917 else
2918 return false;
2919 break;
2920
2921 case GIMPLE_BINARY_RHS:
2922 code = gimple_assign_rhs_code (stmt);
2923 op_type = TREE_CODE_LENGTH (code);
2924 gcc_assert (op_type == binary_op);
2925 ops[0] = gimple_assign_rhs1 (stmt);
2926 ops[1] = gimple_assign_rhs2 (stmt);
2927 break;
2928
2929 case GIMPLE_UNARY_RHS:
2930 return false;
2931
2932 default:
2933 gcc_unreachable ();
2934 }
2935
2936 scalar_dest = gimple_assign_lhs (stmt);
2937 scalar_type = TREE_TYPE (scalar_dest);
2938 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2939 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2940 return false;
2941
2942 /* All uses but the last are expected to be defined in the loop.
2943 The last use is the reduction variable. */
2944 for (i = 0; i < op_type-1; i++)
2945 {
2946 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
2947 &def, &dt);
2948 gcc_assert (is_simple_use);
2949 if (dt != vect_loop_def
2950 && dt != vect_invariant_def
2951 && dt != vect_constant_def
2952 && dt != vect_induction_def)
2953 return false;
2954 }
2955
2956 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def, &dt);
2957 gcc_assert (is_simple_use);
2958 gcc_assert (dt == vect_reduction_def);
2959 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2960 if (orig_stmt)
2961 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2962 else
2963 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2964
2965 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2966 return false;
2967
2968 /* 4. Supportable by target? */
2969
2970 /* 4.1. check support for the operation in the loop */
2971 optab = optab_for_tree_code (code, vectype, optab_default);
2972 if (!optab)
2973 {
2974 if (vect_print_dump_info (REPORT_DETAILS))
2975 fprintf (vect_dump, "no optab.");
2976 return false;
2977 }
2978 vec_mode = TYPE_MODE (vectype);
2979 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2980 {
2981 if (vect_print_dump_info (REPORT_DETAILS))
2982 fprintf (vect_dump, "op not supported by target.");
2983 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2984 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2985 < vect_min_worthwhile_factor (code))
2986 return false;
2987 if (vect_print_dump_info (REPORT_DETAILS))
2988 fprintf (vect_dump, "proceeding using word mode.");
2989 }
2990
2991 /* Worthwhile without SIMD support? */
2992 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2993 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2994 < vect_min_worthwhile_factor (code))
2995 {
2996 if (vect_print_dump_info (REPORT_DETAILS))
2997 fprintf (vect_dump, "not worthwhile without SIMD support.");
2998 return false;
2999 }
3000
3001 /* 4.2. Check support for the epilog operation.
3002
3003 If STMT represents a reduction pattern, then the type of the
3004 reduction variable may be different than the type of the rest
3005 of the arguments. For example, consider the case of accumulation
3006 of shorts into an int accumulator; The original code:
3007 S1: int_a = (int) short_a;
3008 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
3009
3010 was replaced with:
3011 STMT: int_acc = widen_sum <short_a, int_acc>
3012
3013 This means that:
3014 1. The tree-code that is used to create the vector operation in the
3015 epilog code (that reduces the partial results) is not the
3016 tree-code of STMT, but is rather the tree-code of the original
3017 stmt from the pattern that STMT is replacing. I.e, in the example
3018 above we want to use 'widen_sum' in the loop, but 'plus' in the
3019 epilog.
3020 2. The type (mode) we use to check available target support
3021 for the vector operation to be created in the *epilog*, is
3022 determined by the type of the reduction variable (in the example
3023 above we'd check this: plus_optab[vect_int_mode]).
3024 However the type (mode) we use to check available target support
3025 for the vector operation to be created *inside the loop*, is
3026 determined by the type of the other arguments to STMT (in the
3027 example we'd check this: widen_sum_optab[vect_short_mode]).
3028
3029 This is contrary to "regular" reductions, in which the types of all
3030 the arguments are the same as the type of the reduction variable.
3031 For "regular" reductions we can therefore use the same vector type
3032 (and also the same tree-code) when generating the epilog code and
3033 when generating the code inside the loop. */
3034
3035 if (orig_stmt)
3036 {
3037 /* This is a reduction pattern: get the vectype from the type of the
3038 reduction variable, and get the tree-code from orig_stmt. */
3039 orig_code = gimple_assign_rhs_code (orig_stmt);
3040 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3041 if (!vectype)
3042 {
3043 if (vect_print_dump_info (REPORT_DETAILS))
3044 {
3045 fprintf (vect_dump, "unsupported data-type ");
3046 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3047 }
3048 return false;
3049 }
3050
3051 vec_mode = TYPE_MODE (vectype);
3052 }
3053 else
3054 {
3055 /* Regular reduction: the same vectype and tree-code that are used for
3056 the vector code inside the loop can also be used for the epilog code. */
3057 orig_code = code;
3058 }
3059
3060 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3061 return false;
3062 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
3063 if (!reduc_optab)
3064 {
3065 if (vect_print_dump_info (REPORT_DETAILS))
3066 fprintf (vect_dump, "no optab for reduction.");
3067 epilog_reduc_code = NUM_TREE_CODES;
3068 }
3069 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
3070 {
3071 if (vect_print_dump_info (REPORT_DETAILS))
3072 fprintf (vect_dump, "reduc op not supported by target.");
3073 epilog_reduc_code = NUM_TREE_CODES;
3074 }
3075
3076 if (!vec_stmt) /* transformation not required. */
3077 {
3078 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3079 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3080 return false;
3081 return true;
3082 }
3083
3084 /** Transform. **/
3085
3086 if (vect_print_dump_info (REPORT_DETAILS))
3087 fprintf (vect_dump, "transform reduction.");
3088
3089 /* Create the destination vector */
3090 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3091
3092 /* In case the vectorization factor (VF) is bigger than the number
3093 of elements that we can fit in a vectype (nunits), we have to generate
3094 more than one vector stmt - i.e - we need to "unroll" the
3095 vector stmt by a factor VF/nunits. For more details see documentation
3096 in vectorizable_operation. */
3097
3098 /* If the reduction is used in an outer loop we need to generate
3099 VF intermediate results, like so (e.g. for ncopies=2):
3100 r0 = phi (init, r0)
3101 r1 = phi (init, r1)
3102 r0 = x0 + r0;
3103 r1 = x1 + r1;
3104 (i.e. we generate VF results in 2 registers).
3105 In this case we have a separate def-use cycle for each copy, and therefore
3106 for each copy we get the vector def for the reduction variable from the
3107 respective phi node created for this copy.
3108
3109 Otherwise (the reduction is unused in the loop nest), we can combine
3110 intermediate results, like so (e.g. for ncopies=2):
3111 r = phi (init, r)
3112 r = x0 + r;
3113 r = x1 + r;
3114 (i.e. we generate VF/2 results in a single register).
3115 In this case for each copy we get the vector def for the reduction variable
3116 from the vectorized reduction operation generated in the previous iteration.
3117 */
3118
3119 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop)
3120 {
3121 single_defuse_cycle = true;
3122 epilog_copies = 1;
3123 }
3124 else
3125 epilog_copies = ncopies;
3126
3127 prev_stmt_info = NULL;
3128 prev_phi_info = NULL;
3129 for (j = 0; j < ncopies; j++)
3130 {
3131 if (j == 0 || !single_defuse_cycle)
3132 {
3133 /* Create the reduction-phi that defines the reduction-operand. */
3134 new_phi = create_phi_node (vec_dest, loop->header);
3135 set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo));
3136 }
3137
3138 /* Handle uses. */
3139 if (j == 0)
3140 {
3141 loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
3142 if (op_type == ternary_op)
3143 {
3144 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);
3145 }
3146
3147 /* Get the vector def for the reduction variable from the phi node */
3148 reduc_def = PHI_RESULT (new_phi);
3149 first_phi = new_phi;
3150 }
3151 else
3152 {
3153 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3154 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3155 if (op_type == ternary_op)
3156 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3157
3158 if (single_defuse_cycle)
3159 reduc_def = gimple_assign_lhs (new_stmt);
3160 else
3161 reduc_def = PHI_RESULT (new_phi);
3162
3163 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3164 }
3165
3166 /* Arguments are ready. Create the new vector stmt. */
3167 if (op_type == binary_op)
3168 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3169 else
3170 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3171 reduc_def);
3172 new_stmt = gimple_build_assign (vec_dest, expr);
3173 new_temp = make_ssa_name (vec_dest, new_stmt);
3174 gimple_assign_set_lhs (new_stmt, new_temp);
3175 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3176
3177 if (j == 0)
3178 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3179 else
3180 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3181 prev_stmt_info = vinfo_for_stmt (new_stmt);
3182 prev_phi_info = vinfo_for_stmt (new_phi);
3183 }
3184
3185 /* Finalize the reduction-phi (set its arguments) and create the
3186 epilog reduction code. */
3187 if (!single_defuse_cycle)
3188 new_temp = gimple_assign_lhs (*vec_stmt);
3189 vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3190 epilog_reduc_code, first_phi);
3191 return true;
3192 }
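/* Illustration added for exposition (not part of this file): the two
   def-use arrangements for ncopies = 2 described before the transform
   loop above, shown on scalars. With a single def-use cycle copy 1 feeds
   off copy 0; otherwise each copy keeps its own accumulator and the two
   are combined in the epilog. Assumes n is even; names are made up. */
static long
example_reduction_copies (const long *x, int n, long init, int single_cycle)
{
  long r0 = init, r1 = 0;
  int i;
  for (i = 0; i + 2 <= n; i += 2)
    {
      r0 = x[i] + r0;              /* copy 0 */
      if (single_cycle)
        r0 = x[i + 1] + r0;        /* copy 1 feeds off copy 0 */
      else
        r1 = x[i + 1] + r1;        /* copy 1 has its own cycle */
    }
  return r0 + r1;                  /* separate cycles are combined later */
}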
3193
3194 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3195 a function declaration if the target has a vectorized version
3196 of the function, or NULL_TREE if the function cannot be vectorized. */
3197
3198 tree
3199 vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
3200 {
3201 tree fndecl = gimple_call_fndecl (call);
3202 enum built_in_function code;
3203
3204 /* We only handle functions that do not read or clobber memory -- i.e.
3205 const or novops ones. */
3206 if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3207 return NULL_TREE;
3208
3209 if (!fndecl
3210 || TREE_CODE (fndecl) != FUNCTION_DECL
3211 || !DECL_BUILT_IN (fndecl))
3212 return NULL_TREE;
3213
3214 code = DECL_FUNCTION_CODE (fndecl);
3215 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3216 vectype_in);
3217 }
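/* Illustration added for exposition (not part of this file): the kind of
   call the hook above is queried for - a const builtin applied
   elementwise, which some targets can map to a vector intrinsic (sqrt
   typically needs -fno-math-errno to be const). Needs <math.h>. */
#include <math.h>

static void
example_elementwise_call (const double *a, double *b, int n)
{
  int i;
  for (i = 0; i < n; i++)
    b[i] = sqrt (a[i]);   /* candidate for a vectorized builtin */
}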
3218
3219 /* Function vectorizable_call.
3220
3221 Check if STMT performs a function call that can be vectorized.
3222 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3223 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3224 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3225
3226 bool
3227 vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
3228 {
3229 tree vec_dest;
3230 tree scalar_dest;
3231 tree op, type;
3232 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3233 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3234 tree vectype_out, vectype_in;
3235 int nunits_in;
3236 int nunits_out;
3237 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3238 tree fndecl, new_temp, def, rhs_type, lhs_type;
3239 gimple def_stmt;
3240 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3241 gimple new_stmt;
3242 int ncopies, j;
3243 VEC(tree, heap) *vargs = NULL;
3244 enum { NARROW, NONE, WIDEN } modifier;
3245 size_t i, nargs;
3246
3247 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3248 return false;
3249
3250 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3251 return false;
3252
3253 /* FORNOW: SLP not supported. */
3254 if (STMT_SLP_TYPE (stmt_info))
3255 return false;
3256
3257 /* Is STMT a vectorizable call? */
3258 if (!is_gimple_call (stmt))
3259 return false;
3260
3261 if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3262 return false;
3263
3264 /* Process function arguments. */
3265 rhs_type = NULL_TREE;
3266 nargs = gimple_call_num_args (stmt);
3267
3268 /* Bail out if the function has more than two arguments; we do not
3269 have interesting builtin functions to vectorize with more than two
3270 arguments. A call with no arguments is not interesting either. */
3271 if (nargs == 0 || nargs > 2)
3272 return false;
3273
3274 for (i = 0; i < nargs; i++)
3275 {
3276 op = gimple_call_arg (stmt, i);
3277
3278 /* We can only handle calls with arguments of the same type. */
3279 if (rhs_type
3280 && rhs_type != TREE_TYPE (op))
3281 {
3282 if (vect_print_dump_info (REPORT_DETAILS))
3283 fprintf (vect_dump, "argument types differ.");
3284 return false;
3285 }
3286 rhs_type = TREE_TYPE (op);
3287
3288 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
3289 {
3290 if (vect_print_dump_info (REPORT_DETAILS))
3291 fprintf (vect_dump, "use not simple.");
3292 return false;
3293 }
3294 }
3295
3296 vectype_in = get_vectype_for_scalar_type (rhs_type);
3297 if (!vectype_in)
3298 return false;
3299 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3300
3301 lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
3302 vectype_out = get_vectype_for_scalar_type (lhs_type);
3303 if (!vectype_out)
3304 return false;
3305 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3306
3307 /* FORNOW */
3308 if (nunits_in == nunits_out / 2)
3309 modifier = NARROW;
3310 else if (nunits_out == nunits_in)
3311 modifier = NONE;
3312 else if (nunits_out == nunits_in / 2)
3313 modifier = WIDEN;
3314 else
3315 return false;
3316
3317 /* For now, we only vectorize functions if a target specific builtin
3318 is available. TODO -- in some cases, it might be profitable to
3319 insert the calls for pieces of the vector, in order to be able
3320 to vectorize other operations in the loop. */
3321 fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
3322 if (fndecl == NULL_TREE)
3323 {
3324 if (vect_print_dump_info (REPORT_DETAILS))
3325 fprintf (vect_dump, "function is not vectorizable.");
3326
3327 return false;
3328 }
3329
3330 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3331
3332 if (modifier == NARROW)
3333 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3334 else
3335 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3336
3337 /* Sanity check: make sure that at least one copy of the vectorized stmt
3338 needs to be generated. */
3339 gcc_assert (ncopies >= 1);
3340
3341 if (!vec_stmt) /* transformation not required. */
3342 {
3343 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3344 if (vect_print_dump_info (REPORT_DETAILS))
3345 fprintf (vect_dump, "=== vectorizable_call ===");
3346 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3347 return true;
3348 }
3349
3350 /** Transform. **/
3351
3352 if (vect_print_dump_info (REPORT_DETAILS))
3353 fprintf (vect_dump, "transform operation.");
3354
3355 /* Handle def. */
3356 scalar_dest = gimple_call_lhs (stmt);
3357 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3358
3359 prev_stmt_info = NULL;
3360 switch (modifier)
3361 {
3362 case NONE:
3363 for (j = 0; j < ncopies; ++j)
3364 {
3365 /* Build argument list for the vectorized call. */
3366 if (j == 0)
3367 vargs = VEC_alloc (tree, heap, nargs);
3368 else
3369 VEC_truncate (tree, vargs, 0);
3370
3371 for (i = 0; i < nargs; i++)
3372 {
3373 op = gimple_call_arg (stmt, i);
3374 if (j == 0)
3375 vec_oprnd0
3376 = vect_get_vec_def_for_operand (op, stmt, NULL);
3377 else
3378 vec_oprnd0
3379 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
3380
3381 VEC_quick_push (tree, vargs, vec_oprnd0);
3382 }
3383
3384 new_stmt = gimple_build_call_vec (fndecl, vargs);
3385 new_temp = make_ssa_name (vec_dest, new_stmt);
3386 gimple_call_set_lhs (new_stmt, new_temp);
3387
3388 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3389
3390 if (j == 0)
3391 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3392 else
3393 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3394
3395 prev_stmt_info = vinfo_for_stmt (new_stmt);
3396 }
3397
3398 break;
3399
3400 case NARROW:
3401 for (j = 0; j < ncopies; ++j)
3402 {
3403 /* Build argument list for the vectorized call. */
3404 if (j == 0)
3405 vargs = VEC_alloc (tree, heap, nargs * 2);
3406 else
3407 VEC_truncate (tree, vargs, 0);
3408
3409 for (i = 0; i < nargs; i++)
3410 {
3411 op = gimple_call_arg (stmt, i);
3412 if (j == 0)
3413 {
3414 vec_oprnd0
3415 = vect_get_vec_def_for_operand (op, stmt, NULL);
3416 vec_oprnd1
3417 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
3418 }
3419 else
3420 {
3421 vec_oprnd0
3422 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd1);
3423 vec_oprnd1
3424 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
3425 }
3426
3427 VEC_quick_push (tree, vargs, vec_oprnd0);
3428 VEC_quick_push (tree, vargs, vec_oprnd1);
3429 }
3430
3431 new_stmt = gimple_build_call_vec (fndecl, vargs);
3432 new_temp = make_ssa_name (vec_dest, new_stmt);
3433 gimple_call_set_lhs (new_stmt, new_temp);
3434
3435 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3436
3437 if (j == 0)
3438 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3439 else
3440 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3441
3442 prev_stmt_info = vinfo_for_stmt (new_stmt);
3443 }
3444
3445 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3446
3447 break;
3448
3449 case WIDEN:
3450 /* No current target implements this case. */
3451 return false;
3452 }
3453
3454 VEC_free (tree, heap, vargs);
3455
3456 /* Update the exception handling table with the vector stmt if necessary. */
3457 if (maybe_clean_or_replace_eh_stmt (stmt, *vec_stmt))
3458 gimple_purge_dead_eh_edges (gimple_bb (stmt));
3459
3460 /* The call in STMT might prevent it from being removed in dce.
3461 We however cannot remove it here, due to the way the ssa name
3462 it defines is mapped to the new definition. So just replace
3463 rhs of the statement with something harmless. */
3464
3465 type = TREE_TYPE (scalar_dest);
3466 new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
3467 fold_convert (type, integer_zero_node));
3468 set_vinfo_for_stmt (new_stmt, stmt_info);
3469 set_vinfo_for_stmt (stmt, NULL);
3470 STMT_VINFO_STMT (stmt_info) = new_stmt;
3471 gsi_replace (gsi, new_stmt, false);
3472 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3473
3474 return true;
3475 }
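/* A sketch of the copy/modifier bookkeeping in vectorizable_call,
   for illustration only.  Assuming VF == 8, V4SF vectors and a target
   sqrtf builtin (so modifier == NONE, nunits_in == nunits_out == 4 and
   ncopies == 2), the scalar call

     y[i] = sqrtf (x[i]);

   becomes two vector calls chained through STMT_VINFO_RELATED_STMT:

     vy0 = VSQRTF (vx0);   <-- j == 0, recorded in STMT_VINFO_VEC_STMT
     vy1 = VSQRTF (vx1);   <-- j == 1, linked from the first copy

   (VSQRTF stands for whatever builtin the target actually provides.)
   For a NARROW call (nunits_out == 2 * nunits_in) each result vector
   consumes two vector defs per argument, which is why both VEC_OPRND0
   and VEC_OPRND1 are pushed onto VARGS in that case.  */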
3476
3477
3478 /* Function vect_gen_widened_results_half
3479
3480 Create a vector stmt whose code, number of arguments, and result
3481 variable are CODE, OP_TYPE, and VEC_DEST respectively, and whose arguments are
3482 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3483 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3484 needs to be created (DECL is a function-decl of a target-builtin).
3485 STMT is the original scalar stmt that we are vectorizing. */
3486
3487 static gimple
3488 vect_gen_widened_results_half (enum tree_code code,
3489 tree decl,
3490 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3491 tree vec_dest, gimple_stmt_iterator *gsi,
3492 gimple stmt)
3493 {
3494 gimple new_stmt;
3495 tree new_temp;
3496 tree sym;
3497 ssa_op_iter iter;
3498
3499 /* Generate half of the widened result: */
3500 if (code == CALL_EXPR)
3501 {
3502 /* Target specific support */
3503 if (op_type == binary_op)
3504 new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
3505 else
3506 new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
3507 new_temp = make_ssa_name (vec_dest, new_stmt);
3508 gimple_call_set_lhs (new_stmt, new_temp);
3509 }
3510 else
3511 {
3512 /* Generic support */
3513 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3514 if (op_type != binary_op)
3515 vec_oprnd1 = NULL;
3516 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
3517 vec_oprnd1);
3518 new_temp = make_ssa_name (vec_dest, new_stmt);
3519 gimple_assign_set_lhs (new_stmt, new_temp);
3520 }
3521 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3522
3523 if (code == CALL_EXPR)
3524 {
3525 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3526 {
3527 if (TREE_CODE (sym) == SSA_NAME)
3528 sym = SSA_NAME_VAR (sym);
3529 mark_sym_for_renaming (sym);
3530 }
3531 }
3532
3533 return new_stmt;
3534 }
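/* For illustration only: widening one V8HI operand into two V4SI
   results is done by calling vect_gen_widened_results_half twice, once
   with the "low" code/decl and once with the "high" one, e.g. (the
   exact codes come from supportable_widening_operation and are target
   dependent):

     vlo = VEC_UNPACK_LO_EXPR <vx>;
     vhi = VEC_UNPACK_HI_EXPR <vx>;

   Together the two halves cover all eight input elements.  */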
3535
3536
3537 /* Check if STMT performs a conversion operation that can be vectorized.
3538 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3539 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3540 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3541
3542 bool
3543 vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
3544 gimple *vec_stmt, slp_tree slp_node)
3545 {
3546 tree vec_dest;
3547 tree scalar_dest;
3548 tree op0;
3549 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3550 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3551 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3552 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3553 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3554 tree new_temp;
3555 tree def;
3556 gimple def_stmt;
3557 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3558 gimple new_stmt = NULL;
3559 stmt_vec_info prev_stmt_info;
3560 int nunits_in;
3561 int nunits_out;
3562 tree vectype_out, vectype_in;
3563 int ncopies, j;
3564 tree expr;
3565 tree rhs_type, lhs_type;
3566 tree builtin_decl;
3567 enum { NARROW, NONE, WIDEN } modifier;
3568 int i;
3569 VEC(tree,heap) *vec_oprnds0 = NULL;
3570 tree vop0;
3571 tree integral_type;
3572 VEC(tree,heap) *dummy = NULL;
3573 int dummy_int;
3574
3575 /* Is STMT a vectorizable conversion? */
3576
3577 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3578 return false;
3579
3580 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3581 return false;
3582
3583 if (!is_gimple_assign (stmt))
3584 return false;
3585
3586 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3587 return false;
3588
3589 code = gimple_assign_rhs_code (stmt);
3590 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3591 return false;
3592
3593 /* Check types of lhs and rhs. */
3594 op0 = gimple_assign_rhs1 (stmt);
3595 rhs_type = TREE_TYPE (op0);
3596 vectype_in = get_vectype_for_scalar_type (rhs_type);
3597 if (!vectype_in)
3598 return false;
3599 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3600
3601 scalar_dest = gimple_assign_lhs (stmt);
3602 lhs_type = TREE_TYPE (scalar_dest);
3603 vectype_out = get_vectype_for_scalar_type (lhs_type);
3604 if (!vectype_out)
3605 return false;
3606 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3607
3608 /* FORNOW */
3609 if (nunits_in == nunits_out / 2)
3610 modifier = NARROW;
3611 else if (nunits_out == nunits_in)
3612 modifier = NONE;
3613 else if (nunits_out == nunits_in / 2)
3614 modifier = WIDEN;
3615 else
3616 return false;
3617
3618 if (modifier == NONE)
3619 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3620
3621 /* Bail out if the types are both integral or non-integral. */
3622 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3623 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3624 return false;
3625
3626 integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out;
3627
3628 if (modifier == NARROW)
3629 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3630 else
3631 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3632
3633 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3634 this, so we can safely override NCOPIES with 1 here. */
3635 if (slp_node)
3636 ncopies = 1;
3637
3638 /* Sanity check: make sure that at least one copy of the vectorized stmt
3639 needs to be generated. */
3640 gcc_assert (ncopies >= 1);
3641
3642 /* Check the operands of the operation. */
3643 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3644 {
3645 if (vect_print_dump_info (REPORT_DETAILS))
3646 fprintf (vect_dump, "use not simple.");
3647 return false;
3648 }
3649
3650 /* Supportable by target? */
3651 if ((modifier == NONE
3652 && !targetm.vectorize.builtin_conversion (code, integral_type))
3653 || (modifier == WIDEN
3654 && !supportable_widening_operation (code, stmt, vectype_in,
3655 &decl1, &decl2,
3656 &code1, &code2,
3657 &dummy_int, &dummy))
3658 || (modifier == NARROW
3659 && !supportable_narrowing_operation (code, stmt, vectype_in,
3660 &code1, &dummy_int, &dummy)))
3661 {
3662 if (vect_print_dump_info (REPORT_DETAILS))
3663 fprintf (vect_dump, "conversion not supported by target.");
3664 return false;
3665 }
3666
3667 if (modifier != NONE)
3668 {
3669 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3670 /* FORNOW: SLP not supported. */
3671 if (STMT_SLP_TYPE (stmt_info))
3672 return false;
3673 }
3674
3675 if (!vec_stmt) /* transformation not required. */
3676 {
3677 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3678 return true;
3679 }
3680
3681 /** Transform. **/
3682 if (vect_print_dump_info (REPORT_DETAILS))
3683 fprintf (vect_dump, "transform conversion.");
3684
3685 /* Handle def. */
3686 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3687
3688 if (modifier == NONE && !slp_node)
3689 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3690
3691 prev_stmt_info = NULL;
3692 switch (modifier)
3693 {
3694 case NONE:
3695 for (j = 0; j < ncopies; j++)
3696 {
3697 tree sym;
3698 ssa_op_iter iter;
3699
3700 if (j == 0)
3701 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3702 else
3703 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3704
3705 builtin_decl =
3706 targetm.vectorize.builtin_conversion (code, integral_type);
3707 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3708 {
3709 /* Arguments are ready. Create the new vector stmt. */
3710 new_stmt = gimple_build_call (builtin_decl, 1, vop0);
3711 new_temp = make_ssa_name (vec_dest, new_stmt);
3712 gimple_call_set_lhs (new_stmt, new_temp);
3713 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3714 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3715 SSA_OP_ALL_VIRTUALS)
3716 {
3717 if (TREE_CODE (sym) == SSA_NAME)
3718 sym = SSA_NAME_VAR (sym);
3719 mark_sym_for_renaming (sym);
3720 }
3721 if (slp_node)
3722 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3723 }
3724
3725 if (j == 0)
3726 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3727 else
3728 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3729 prev_stmt_info = vinfo_for_stmt (new_stmt);
3730 }
3731 break;
3732
3733 case WIDEN:
3734 /* In case the vectorization factor (VF) is bigger than the number
3735 of elements that we can fit in a vectype (nunits), we have to
3736 generate more than one vector stmt - i.e - we need to "unroll"
3737 the vector stmt by a factor VF/nunits. */
3738 for (j = 0; j < ncopies; j++)
3739 {
3740 if (j == 0)
3741 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3742 else
3743 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3744
3745 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3746
3747 /* Generate first half of the widened result: */
3748 new_stmt
3749 = vect_gen_widened_results_half (code1, decl1,
3750 vec_oprnd0, vec_oprnd1,
3751 unary_op, vec_dest, gsi, stmt);
3752 if (j == 0)
3753 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3754 else
3755 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3756 prev_stmt_info = vinfo_for_stmt (new_stmt);
3757
3758 /* Generate second half of the widened result: */
3759 new_stmt
3760 = vect_gen_widened_results_half (code2, decl2,
3761 vec_oprnd0, vec_oprnd1,
3762 unary_op, vec_dest, gsi, stmt);
3763 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3764 prev_stmt_info = vinfo_for_stmt (new_stmt);
3765 }
3766 break;
3767
3768 case NARROW:
3769 /* In case the vectorization factor (VF) is bigger than the number
3770 of elements that we can fit in a vectype (nunits), we have to
3771 generate more than one vector stmt - i.e - we need to "unroll"
3772 the vector stmt by a factor VF/nunits. */
3773 for (j = 0; j < ncopies; j++)
3774 {
3775 /* Handle uses. */
3776 if (j == 0)
3777 {
3778 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3779 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3780 }
3781 else
3782 {
3783 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3784 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3785 }
3786
3787 /* Arguments are ready. Create the new vector stmt. */
3788 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3789 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
3790 vec_oprnd1);
3791 new_temp = make_ssa_name (vec_dest, new_stmt);
3792 gimple_assign_set_lhs (new_stmt, new_temp);
3793 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3794
3795 if (j == 0)
3796 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3797 else
3798 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3799
3800 prev_stmt_info = vinfo_for_stmt (new_stmt);
3801 }
3802
3803 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3804 }
3805
3806 if (vec_oprnds0)
3807 VEC_free (tree, heap, vec_oprnds0);
3808
3809 return true;
3810 }
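/* A sketch of the three modifiers handled in vectorizable_conversion,
   for illustration only (the builtins and codes named here are target
   dependent and come from targetm.vectorize.builtin_conversion or
   supportable_widening/narrowing_operation):

     NONE:   int <-> float of equal width; one builtin call per copy,
             e.g.  vf = __builtin_ia32_cvtdq2ps (vi);
     WIDEN:  e.g. int -> double; each input vector produces two result
             vectors via vect_gen_widened_results_half.
     NARROW: e.g. double -> int; two input vectors are combined into a
             single result vector per generated stmt.  */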
3811
3812
3813 /* Function vectorizable_assignment.
3814
3815 Check if STMT performs an assignment (copy) that can be vectorized.
3816 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3817 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3818 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3819
3820 bool
3821 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3822 gimple *vec_stmt, slp_tree slp_node)
3823 {
3824 tree vec_dest;
3825 tree scalar_dest;
3826 tree op;
3827 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3828 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3829 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3830 tree new_temp;
3831 tree def;
3832 gimple def_stmt;
3833 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3834 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3835 int ncopies;
3836 int i;
3837 VEC(tree,heap) *vec_oprnds = NULL;
3838 tree vop;
3839
3840 /* Multiple types in SLP are handled by creating the appropriate number of
3841 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
3842 case of SLP. */
3843 if (slp_node)
3844 ncopies = 1;
3845 else
3846 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3847
3848 gcc_assert (ncopies >= 1);
3849 if (ncopies > 1)
3850 return false; /* FORNOW */
3851
3852 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3853 return false;
3854
3855 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3856 return false;
3857
3858 /* Is vectorizable assignment? */
3859 if (!is_gimple_assign (stmt))
3860 return false;
3861
3862 scalar_dest = gimple_assign_lhs (stmt);
3863 if (TREE_CODE (scalar_dest) != SSA_NAME)
3864 return false;
3865
3866 if (gimple_assign_single_p (stmt)
3867 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3868 op = gimple_assign_rhs1 (stmt);
3869 else
3870 return false;
3871
3872 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3873 {
3874 if (vect_print_dump_info (REPORT_DETAILS))
3875 fprintf (vect_dump, "use not simple.");
3876 return false;
3877 }
3878
3879 if (!vec_stmt) /* transformation not required. */
3880 {
3881 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3882 if (vect_print_dump_info (REPORT_DETAILS))
3883 fprintf (vect_dump, "=== vectorizable_assignment ===");
3884 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3885 return true;
3886 }
3887
3888 /** Transform. **/
3889 if (vect_print_dump_info (REPORT_DETAILS))
3890 fprintf (vect_dump, "transform assignment.");
3891
3892 /* Handle def. */
3893 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3894
3895 /* Handle use. */
3896 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3897
3898 /* Arguments are ready. Create the new vector stmt. */
3899 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3900 {
3901 *vec_stmt = gimple_build_assign (vec_dest, vop);
3902 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3903 gimple_assign_set_lhs (*vec_stmt, new_temp);
3904 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3905 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3906
3907 if (slp_node)
3908 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3909 }
3910
3911 VEC_free (tree, heap, vec_oprnds);
3912 return true;
3913 }
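/* For illustration only: a plain SSA copy such as

     x_1 = y_2;          (or  x_1 = ((y_2))  via PAREN_EXPR)

   is vectorized simply by copying the operand's vector def:

     vx = vy;

   so no computation beyond the copy itself is generated, and with the
   FORNOW restriction above only the single-copy (ncopies == 1) case is
   handled.  */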
3914
3915
3916 /* Function vect_min_worthwhile_factor.
3917
3918 For a loop where we could vectorize the operation indicated by CODE,
3919 return the minimum vectorization factor that makes it worthwhile
3920 to use generic vectors. */
3921 static int
3922 vect_min_worthwhile_factor (enum tree_code code)
3923 {
3924 switch (code)
3925 {
3926 case PLUS_EXPR:
3927 case MINUS_EXPR:
3928 case NEGATE_EXPR:
3929 return 4;
3930
3931 case BIT_AND_EXPR:
3932 case BIT_IOR_EXPR:
3933 case BIT_XOR_EXPR:
3934 case BIT_NOT_EXPR:
3935 return 2;
3936
3937 default:
3938 return INT_MAX;
3939 }
3940 }
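/* For example, with a vectorization factor of 4 a PLUS_EXPR is still
   considered worthwhile when only word-mode (non-SIMD) vectors are
   available (4 >= 4), while with VF == 2 it is not; bitwise operations
   already pay off at VF == 2, and any other code (INT_MAX) is never
   emulated in word mode.  */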
3941
3942
3943 /* Function vectorizable_induction
3944
3945 Check if PHI performs an induction computation that can be vectorized.
3946 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3947 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3948 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3949
3950 bool
3951 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3952 gimple *vec_stmt)
3953 {
3954 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3955 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3956 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3957 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3958 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3959 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3960 tree vec_def;
3961
3962 gcc_assert (ncopies >= 1);
3963 /* FORNOW. This restriction should be relaxed. */
3964 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3965 {
3966 if (vect_print_dump_info (REPORT_DETAILS))
3967 fprintf (vect_dump, "multiple types in nested loop.");
3968 return false;
3969 }
3970
3971 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3972 return false;
3973
3974 /* FORNOW: SLP not supported. */
3975 if (STMT_SLP_TYPE (stmt_info))
3976 return false;
3977
3978 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3979
3980 if (gimple_code (phi) != GIMPLE_PHI)
3981 return false;
3982
3983 if (!vec_stmt) /* transformation not required. */
3984 {
3985 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3986 if (vect_print_dump_info (REPORT_DETAILS))
3987 fprintf (vect_dump, "=== vectorizable_induction ===");
3988 vect_model_induction_cost (stmt_info, ncopies);
3989 return true;
3990 }
3991
3992 /** Transform. **/
3993
3994 if (vect_print_dump_info (REPORT_DETAILS))
3995 fprintf (vect_dump, "transform induction phi.");
3996
3997 vec_def = get_initial_def_for_induction (phi);
3998 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3999 return true;
4000 }
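/* For illustration only, assuming VF == 4 and a scalar induction
   i = { 0, +, 1 }: get_initial_def_for_induction builds roughly

     vec_init = { 0, 1, 2, 3 };
     vec_step = { 4, 4, 4, 4 };
     loop:
       vi      = PHI <vec_init (preheader), vi_next (latch)>
       vi_next = vi + vec_step;

   i.e. a single vector induction whose lanes all advance by VF scalar
   steps per iteration.  */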
4001
4002
4003 /* Function vectorizable_operation.
4004
4005 Check if STMT performs a binary or unary operation that can be vectorized.
4006 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4007 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4008 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4009
4010 bool
4011 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
4012 gimple *vec_stmt, slp_tree slp_node)
4013 {
4014 tree vec_dest;
4015 tree scalar_dest;
4016 tree op0, op1 = NULL;
4017 tree vec_oprnd1 = NULL_TREE;
4018 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4019 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4020 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4021 enum tree_code code;
4022 enum machine_mode vec_mode;
4023 tree new_temp;
4024 int op_type;
4025 optab optab;
4026 int icode;
4027 enum machine_mode optab_op2_mode;
4028 tree def;
4029 gimple def_stmt;
4030 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4031 gimple new_stmt = NULL;
4032 stmt_vec_info prev_stmt_info;
4033 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
4034 int nunits_out;
4035 tree vectype_out;
4036 int ncopies;
4037 int j, i;
4038 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4039 tree vop0, vop1;
4040 unsigned int k;
4041 bool shift_p = false;
4042 bool scalar_shift_arg = false;
4043
4044 /* Multiple types in SLP are handled by creating the appropriate number of
4045 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4046 case of SLP. */
4047 if (slp_node)
4048 ncopies = 1;
4049 else
4050 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4051
4052 gcc_assert (ncopies >= 1);
4053
4054 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4055 return false;
4056
4057 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4058 return false;
4059
4060 /* Is STMT a vectorizable binary/unary operation? */
4061 if (!is_gimple_assign (stmt))
4062 return false;
4063
4064 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4065 return false;
4066
4067 scalar_dest = gimple_assign_lhs (stmt);
4068 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4069 if (!vectype_out)
4070 return false;
4071 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4072 if (nunits_out != nunits_in)
4073 return false;
4074
4075 code = gimple_assign_rhs_code (stmt);
4076
4077 /* For pointer addition, we should use the normal plus for
4078 the vector addition. */
4079 if (code == POINTER_PLUS_EXPR)
4080 code = PLUS_EXPR;
4081
4082 /* Support only unary or binary operations. */
4083 op_type = TREE_CODE_LENGTH (code);
4084 if (op_type != unary_op && op_type != binary_op)
4085 {
4086 if (vect_print_dump_info (REPORT_DETAILS))
4087 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
4088 return false;
4089 }
4090
4091 op0 = gimple_assign_rhs1 (stmt);
4092 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4093 {
4094 if (vect_print_dump_info (REPORT_DETAILS))
4095 fprintf (vect_dump, "use not simple.");
4096 return false;
4097 }
4098
4099 if (op_type == binary_op)
4100 {
4101 op1 = gimple_assign_rhs2 (stmt);
4102 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4103 {
4104 if (vect_print_dump_info (REPORT_DETAILS))
4105 fprintf (vect_dump, "use not simple.");
4106 return false;
4107 }
4108 }
4109
4110 /* If this is a shift/rotate, determine whether the shift amount is a vector,
4111 or scalar. If the shift/rotate amount is a vector, use the vector/vector
4112 shift optabs. */
4113 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
4114 || code == RROTATE_EXPR)
4115 {
4116 shift_p = true;
4117
4118 /* vector shifted by vector */
4119 if (dt[1] == vect_loop_def)
4120 {
4121 optab = optab_for_tree_code (code, vectype, optab_vector);
4122 if (vect_print_dump_info (REPORT_DETAILS))
4123 fprintf (vect_dump, "vector/vector shift/rotate found.");
4124 }
4125
4126 /* See if the machine has a vector-shifted-by-scalar insn, and if not,
4127 whether it has a vector-shifted-by-vector insn. */
4128 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4129 {
4130 optab = optab_for_tree_code (code, vectype, optab_scalar);
4131 if (optab
4132 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4133 != CODE_FOR_nothing))
4134 {
4135 scalar_shift_arg = true;
4136 if (vect_print_dump_info (REPORT_DETAILS))
4137 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4138 }
4139 else
4140 {
4141 optab = optab_for_tree_code (code, vectype, optab_vector);
4142 if (vect_print_dump_info (REPORT_DETAILS)
4143 && optab
4144 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4145 != CODE_FOR_nothing))
4146 fprintf (vect_dump, "vector/vector shift/rotate found.");
4147 }
4148 }
4149
4150 else
4151 {
4152 if (vect_print_dump_info (REPORT_DETAILS))
4153 fprintf (vect_dump, "operand mode requires invariant argument.");
4154 return false;
4155 }
4156 }
4157 else
4158 optab = optab_for_tree_code (code, vectype, optab_default);
4159
4160 /* Supportable by target? */
4161 if (!optab)
4162 {
4163 if (vect_print_dump_info (REPORT_DETAILS))
4164 fprintf (vect_dump, "no optab.");
4165 return false;
4166 }
4167 vec_mode = TYPE_MODE (vectype);
4168 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4169 if (icode == CODE_FOR_nothing)
4170 {
4171 if (vect_print_dump_info (REPORT_DETAILS))
4172 fprintf (vect_dump, "op not supported by target.");
4173 /* Check only during analysis. */
4174 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4175 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4176 < vect_min_worthwhile_factor (code)
4177 && !vec_stmt))
4178 return false;
4179 if (vect_print_dump_info (REPORT_DETAILS))
4180 fprintf (vect_dump, "proceeding using word mode.");
4181 }
4182
4183 /* Worthwhile without SIMD support? Check only during analysis. */
4184 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4185 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4186 < vect_min_worthwhile_factor (code)
4187 && !vec_stmt)
4188 {
4189 if (vect_print_dump_info (REPORT_DETAILS))
4190 fprintf (vect_dump, "not worthwhile without SIMD support.");
4191 return false;
4192 }
4193
4194 if (!vec_stmt) /* transformation not required. */
4195 {
4196 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4197 if (vect_print_dump_info (REPORT_DETAILS))
4198 fprintf (vect_dump, "=== vectorizable_operation ===");
4199 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4200 return true;
4201 }
4202
4203 /** Transform. **/
4204
4205 if (vect_print_dump_info (REPORT_DETAILS))
4206 fprintf (vect_dump, "transform binary/unary operation.");
4207
4208 /* Handle def. */
4209 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4210
4211 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4212 created in the previous stages of the recursion, so no allocation is
4213 needed, except for the case of shift with scalar shift argument. In that
4214 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4215 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4216 In case of loop-based vectorization we allocate VECs of size 1. We
4217 allocate VEC_OPRNDS1 only in case of binary operation. */
4218 if (!slp_node)
4219 {
4220 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4221 if (op_type == binary_op)
4222 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4223 }
4224 else if (scalar_shift_arg)
4225 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4226
4227 /* In case the vectorization factor (VF) is bigger than the number
4228 of elements that we can fit in a vectype (nunits), we have to generate
4229 more than one vector stmt - i.e - we need to "unroll" the
4230 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4231 from one copy of the vector stmt to the next, in the field
4232 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4233 stages to find the correct vector defs to be used when vectorizing
4234 stmts that use the defs of the current stmt. The example below illustrates
4235 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4236 4 vectorized stmts):
4237
4238 before vectorization:
4239 RELATED_STMT VEC_STMT
4240 S1: x = memref - -
4241 S2: z = x + 1 - -
4242
4243 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4244 there):
4245 RELATED_STMT VEC_STMT
4246 VS1_0: vx0 = memref0 VS1_1 -
4247 VS1_1: vx1 = memref1 VS1_2 -
4248 VS1_2: vx2 = memref2 VS1_3 -
4249 VS1_3: vx3 = memref3 - -
4250 S1: x = load - VS1_0
4251 S2: z = x + 1 - -
4252
4253 step2: vectorize stmt S2 (done here):
4254 To vectorize stmt S2 we first need to find the relevant vector
4255 def for the first operand 'x'. This is, as usual, obtained from
4256 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4257 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4258 relevant vector def 'vx0'. Having found 'vx0' we can generate
4259 the vector stmt VS2_0, and as usual, record it in the
4260 STMT_VINFO_VEC_STMT of stmt S2.
4261 When creating the second copy (VS2_1), we obtain the relevant vector
4262 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4263 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4264 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4265 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4266 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4267 chain of stmts and pointers:
4268 RELATED_STMT VEC_STMT
4269 VS1_0: vx0 = memref0 VS1_1 -
4270 VS1_1: vx1 = memref1 VS1_2 -
4271 VS1_2: vx2 = memref2 VS1_3 -
4272 VS1_3: vx3 = memref3 - -
4273 S1: x = load - VS1_0
4274 VS2_0: vz0 = vx0 + v1 VS2_1 -
4275 VS2_1: vz1 = vx1 + v1 VS2_2 -
4276 VS2_2: vz2 = vx2 + v1 VS2_3 -
4277 VS2_3: vz3 = vx3 + v1 - -
4278 S2: z = x + 1 - VS2_0 */
4279
4280 prev_stmt_info = NULL;
4281 for (j = 0; j < ncopies; j++)
4282 {
4283 /* Handle uses. */
4284 if (j == 0)
4285 {
4286 if (op_type == binary_op && scalar_shift_arg)
4287 {
4288 /* Vector shl and shr insn patterns can be defined with scalar
4289 operand 2 (shift operand). In this case, use constant or loop
4290 invariant op1 directly, without extending it to vector mode
4291 first. */
4292 optab_op2_mode = insn_data[icode].operand[2].mode;
4293 if (!VECTOR_MODE_P (optab_op2_mode))
4294 {
4295 if (vect_print_dump_info (REPORT_DETAILS))
4296 fprintf (vect_dump, "operand 1 using scalar mode.");
4297 vec_oprnd1 = op1;
4298 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4299 if (slp_node)
4300 {
4301 /* Store vec_oprnd1 for every vector stmt to be created
4302 for SLP_NODE. We check during the analysis that all the
4303 shift arguments are the same.
4304 TODO: Allow different constants for different vector
4305 stmts generated for an SLP instance. */
4306 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4307 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4308 }
4309 }
4310 }
4311
4312 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4313 (a special case for certain kind of vector shifts); otherwise,
4314 operand 1 should be of a vector type (the usual case). */
4315 if (op_type == binary_op && !vec_oprnd1)
4316 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4317 slp_node);
4318 else
4319 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4320 slp_node);
4321 }
4322 else
4323 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4324
4325 /* Arguments are ready. Create the new vector stmt. */
4326 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4327 {
4328 vop1 = ((op_type == binary_op)
4329 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4330 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4331 new_temp = make_ssa_name (vec_dest, new_stmt);
4332 gimple_assign_set_lhs (new_stmt, new_temp);
4333 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4334 if (slp_node)
4335 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4336 }
4337
4338 if (slp_node)
4339 continue;
4340
4341 if (j == 0)
4342 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4343 else
4344 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4345 prev_stmt_info = vinfo_for_stmt (new_stmt);
4346 }
4347
4348 VEC_free (tree, heap, vec_oprnds0);
4349 if (vec_oprnds1)
4350 VEC_free (tree, heap, vec_oprnds1);
4351
4352 return true;
4353 }
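/* An additional sketch for the shift handling above, for illustration
   only.  For

     a[i] = b[i] << 3;

   the shift amount is a loop invariant, so the vector/scalar shift
   optab (optab_scalar) is tried first; if the target provides such a
   pattern the constant 3 is used directly as operand 1 of every vector
   stmt (SCALAR_SHIFT_ARG), otherwise the constant is broadcast into a
   vector and the vector/vector optab is used instead.  */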
4354
4355
4356 /* Get vectorized definitions for loop-based vectorization. For the first
4357 operand we call vect_get_vec_def_for_operand() (with OPRND containing
4358 scalar operand), and for the rest we get a copy with
4359 vect_get_vec_def_for_stmt_copy() using the previous vector definition
4360 (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
4361 The vectors are collected into VEC_OPRNDS. */
4362
4363 static void
4364 vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
4365 VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
4366 {
4367 tree vec_oprnd;
4368
4369 /* Get first vector operand. */
4370 /* All the vector operands except the very first one (that is scalar oprnd)
4371 are stmt copies. */
4372 if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
4373 vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
4374 else
4375 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
4376
4377 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4378
4379 /* Get second vector operand. */
4380 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
4381 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4382
4383 *oprnd = vec_oprnd;
4384
4385 /* For conversion in multiple steps, continue to get operands
4386 recursively. */
4387 if (multi_step_cvt)
4388 vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds, multi_step_cvt - 1);
4389 }
4390
4391
4392 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
4393 For multi-step conversions store the resulting vectors and call the function
4394 recursively. */
4395
4396 static void
4397 vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
4398 int multi_step_cvt, gimple stmt,
4399 VEC (tree, heap) *vec_dsts,
4400 gimple_stmt_iterator *gsi,
4401 slp_tree slp_node, enum tree_code code,
4402 stmt_vec_info *prev_stmt_info)
4403 {
4404 unsigned int i;
4405 tree vop0, vop1, new_tmp, vec_dest;
4406 gimple new_stmt;
4407 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4408
4409 vec_dest = VEC_pop (tree, vec_dsts);
4410
4411 for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
4412 {
4413 /* Create demotion operation. */
4414 vop0 = VEC_index (tree, *vec_oprnds, i);
4415 vop1 = VEC_index (tree, *vec_oprnds, i + 1);
4416 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4417 new_tmp = make_ssa_name (vec_dest, new_stmt);
4418 gimple_assign_set_lhs (new_stmt, new_tmp);
4419 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4420
4421 if (multi_step_cvt)
4422 /* Store the resulting vector for next recursive call. */
4423 VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
4424 else
4425 {
4426 /* This is the last step of the conversion sequence. Store the
4427 vectors in SLP_NODE or in vector info of the scalar statement
4428 (or in STMT_VINFO_RELATED_STMT chain). */
4429 if (slp_node)
4430 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4431 else
4432 {
4433 if (!*prev_stmt_info)
4434 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4435 else
4436 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
4437
4438 *prev_stmt_info = vinfo_for_stmt (new_stmt);
4439 }
4440 }
4441 }
4442
4443 /* For multi-step demotion operations we first generate demotion operations
4444 from the source type to the intermediate types, and then combine the
4445 results (stored in VEC_OPRNDS) in a demotion operation to the destination
4446 type. */
4447 if (multi_step_cvt)
4448 {
4449 /* At each level of recursion we have half of the operands we had at the
4450 previous level. */
4451 VEC_truncate (tree, *vec_oprnds, (i+1)/2);
4452 vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
4453 stmt, vec_dsts, gsi, slp_node,
4454 code, prev_stmt_info);
4455 }
4456 }
4457
4458
4459 /* Function vectorizable_type_demotion
4460
4461 Check if STMT performs a binary or unary operation that involves
4462 type demotion, and if it can be vectorized.
4463 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4464 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4465 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4466
4467 bool
4468 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4469 gimple *vec_stmt, slp_tree slp_node)
4470 {
4471 tree vec_dest;
4472 tree scalar_dest;
4473 tree op0;
4474 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4475 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4476 enum tree_code code, code1 = ERROR_MARK;
4477 tree def;
4478 gimple def_stmt;
4479 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4480 stmt_vec_info prev_stmt_info;
4481 int nunits_in;
4482 int nunits_out;
4483 tree vectype_out;
4484 int ncopies;
4485 int j, i;
4486 tree vectype_in;
4487 int multi_step_cvt = 0;
4488 VEC (tree, heap) *vec_oprnds0 = NULL;
4489 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4490 tree last_oprnd, intermediate_type;
4491
4492 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4493 return false;
4494
4495 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4496 return false;
4497
4498 /* Is STMT a vectorizable type-demotion operation? */
4499 if (!is_gimple_assign (stmt))
4500 return false;
4501
4502 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4503 return false;
4504
4505 code = gimple_assign_rhs_code (stmt);
4506 if (!CONVERT_EXPR_CODE_P (code))
4507 return false;
4508
4509 op0 = gimple_assign_rhs1 (stmt);
4510 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4511 if (!vectype_in)
4512 return false;
4513 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4514
4515 scalar_dest = gimple_assign_lhs (stmt);
4516 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4517 if (!vectype_out)
4518 return false;
4519 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4520 if (nunits_in >= nunits_out)
4521 return false;
4522
4523 /* Multiple types in SLP are handled by creating the appropriate number of
4524 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4525 case of SLP. */
4526 if (slp_node)
4527 ncopies = 1;
4528 else
4529 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4530
4531 gcc_assert (ncopies >= 1);
4532
4533 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4534 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4535 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4536 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4537 && CONVERT_EXPR_CODE_P (code))))
4538 return false;
4539
4540 /* Check the operands of the operation. */
4541 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4542 {
4543 if (vect_print_dump_info (REPORT_DETAILS))
4544 fprintf (vect_dump, "use not simple.");
4545 return false;
4546 }
4547
4548 /* Supportable by target? */
4549 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1,
4550 &multi_step_cvt, &interm_types))
4551 return false;
4552
4553 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4554
4555 if (!vec_stmt) /* transformation not required. */
4556 {
4557 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4558 if (vect_print_dump_info (REPORT_DETAILS))
4559 fprintf (vect_dump, "=== vectorizable_demotion ===");
4560 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4561 return true;
4562 }
4563
4564 /** Transform. **/
4565 if (vect_print_dump_info (REPORT_DETAILS))
4566 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4567 ncopies);
4568
4569 /* In case of multi-step demotion, we first generate demotion operations to
4570 the intermediate types, and then from those types to the final one.
4571 We create vector destinations for the intermediate type (TYPES) received
4572 from supportable_narrowing_operation, and store them in the correct order
4573 for future use in vect_create_vectorized_demotion_stmts(). */
4574 if (multi_step_cvt)
4575 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4576 else
4577 vec_dsts = VEC_alloc (tree, heap, 1);
4578
4579 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4580 VEC_quick_push (tree, vec_dsts, vec_dest);
4581
4582 if (multi_step_cvt)
4583 {
4584 for (i = VEC_length (tree, interm_types) - 1;
4585 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4586 {
4587 vec_dest = vect_create_destination_var (scalar_dest,
4588 intermediate_type);
4589 VEC_quick_push (tree, vec_dsts, vec_dest);
4590 }
4591 }
4592
4593 /* In case the vectorization factor (VF) is bigger than the number
4594 of elements that we can fit in a vectype (nunits), we have to generate
4595 more than one vector stmt - i.e - we need to "unroll" the
4596 vector stmt by a factor VF/nunits. */
4597 last_oprnd = op0;
4598 prev_stmt_info = NULL;
4599 for (j = 0; j < ncopies; j++)
4600 {
4601 /* Handle uses. */
4602 if (slp_node)
4603 vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
4604 else
4605 {
4606 VEC_free (tree, heap, vec_oprnds0);
4607 vec_oprnds0 = VEC_alloc (tree, heap,
4608 (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2));
4609 vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
4610 vect_pow2 (multi_step_cvt) - 1);
4611 }
4612
4613 /* Arguments are ready. Create the new vector stmts. */
4614 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4615 vect_create_vectorized_demotion_stmts (&vec_oprnds0,
4616 multi_step_cvt, stmt, tmp_vec_dsts,
4617 gsi, slp_node, code1,
4618 &prev_stmt_info);
4619 }
4620
4621 VEC_free (tree, heap, vec_oprnds0);
4622 VEC_free (tree, heap, vec_dsts);
4623 VEC_free (tree, heap, tmp_vec_dsts);
4624 VEC_free (tree, heap, interm_types);
4625
4626 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4627 return true;
4628 }
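/* For illustration only, assuming VF == 8 and an int -> short demotion
   with V4SI inputs and V8HI outputs: each generated stmt packs two
   V4SI defs into one V8HI result, e.g. with a VEC_PACK_TRUNC_EXPR

     vs0 = VEC_PACK_TRUNC_EXPR <vi0, vi1>;

   A two-step demotion such as int -> char goes through an intermediate
   type first (int -> short -> char); this is what MULTI_STEP_CVT and
   the INTERM_TYPES vector returned by supportable_narrowing_operation
   describe, and vect_create_vectorized_demotion_stmts recurses once per
   step.  */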
4629
4630
4631 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
4632 and VEC_OPRNDS1 (for binary operations). For multi-step conversions store
4633 the resulting vectors and call the function recursively. */
4634
4635 static void
4636 vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
4637 VEC (tree, heap) **vec_oprnds1,
4638 int multi_step_cvt, gimple stmt,
4639 VEC (tree, heap) *vec_dsts,
4640 gimple_stmt_iterator *gsi,
4641 slp_tree slp_node, enum tree_code code1,
4642 enum tree_code code2, tree decl1,
4643 tree decl2, int op_type,
4644 stmt_vec_info *prev_stmt_info)
4645 {
4646 int i;
4647 tree vop0, vop1, new_tmp1, new_tmp2, vec_dest;
4648 gimple new_stmt1, new_stmt2;
4649 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4650 VEC (tree, heap) *vec_tmp;
4651
4652 vec_dest = VEC_pop (tree, vec_dsts);
4653 vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
4654
4655 for (i = 0; VEC_iterate (tree, *vec_oprnds0, i, vop0); i++)
4656 {
4657 if (op_type == binary_op)
4658 vop1 = VEC_index (tree, *vec_oprnds1, i);
4659 else
4660 vop1 = NULL_TREE;
4661
4662 /* Generate the two halves of promotion operation. */
4663 new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
4664 op_type, vec_dest, gsi, stmt);
4665 new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
4666 op_type, vec_dest, gsi, stmt);
4667 if (is_gimple_call (new_stmt1))
4668 {
4669 new_tmp1 = gimple_call_lhs (new_stmt1);
4670 new_tmp2 = gimple_call_lhs (new_stmt2);
4671 }
4672 else
4673 {
4674 new_tmp1 = gimple_assign_lhs (new_stmt1);
4675 new_tmp2 = gimple_assign_lhs (new_stmt2);
4676 }
4677
4678 if (multi_step_cvt)
4679 {
4680 /* Store the results for the recursive call. */
4681 VEC_quick_push (tree, vec_tmp, new_tmp1);
4682 VEC_quick_push (tree, vec_tmp, new_tmp2);
4683 }
4684 else
4685 {
4686 /* Last step of the promotion sequence - store the results. */
4687 if (slp_node)
4688 {
4689 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1);
4690 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2);
4691 }
4692 else
4693 {
4694 if (!*prev_stmt_info)
4695 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1;
4696 else
4697 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1;
4698
4699 *prev_stmt_info = vinfo_for_stmt (new_stmt1);
4700 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2;
4701 *prev_stmt_info = vinfo_for_stmt (new_stmt2);
4702 }
4703 }
4704 }
4705
4706 if (multi_step_cvt)
4707 {
4708 /* For a multi-step promotion operation we call the function
4709 recursively for every stage: we start from the input type,
4710 create promotion operations to the intermediate types, and then
4711 create promotions to the output type. */
4712 *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp);
4713 VEC_free (tree, heap, vec_tmp);
4714 vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1,
4715 multi_step_cvt - 1, stmt,
4716 vec_dsts, gsi, slp_node, code1,
4717 code2, decl1, decl2, op_type,
4718 prev_stmt_info);
4719 }
4720 }
4721
4722
4723 /* Function vectorizable_type_promotion
4724
4725 Check if STMT performs a binary or unary operation that involves
4726 type promotion, and if it can be vectorized.
4727 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4728 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4729 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4730
4731 bool
4732 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4733 gimple *vec_stmt, slp_tree slp_node)
4734 {
4735 tree vec_dest;
4736 tree scalar_dest;
4737 tree op0, op1 = NULL;
4738 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4739 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4740 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4741 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4742 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4743 int op_type;
4744 tree def;
4745 gimple def_stmt;
4746 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4747 stmt_vec_info prev_stmt_info;
4748 int nunits_in;
4749 int nunits_out;
4750 tree vectype_out;
4751 int ncopies;
4752 int j, i;
4753 tree vectype_in;
4754 tree intermediate_type = NULL_TREE;
4755 int multi_step_cvt = 0;
4756 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4757 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4758
4759 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4760 return false;
4761
4762 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4763 return false;
4764
4765 /* Is STMT a vectorizable type-promotion operation? */
4766 if (!is_gimple_assign (stmt))
4767 return false;
4768
4769 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4770 return false;
4771
4772 code = gimple_assign_rhs_code (stmt);
4773 if (!CONVERT_EXPR_CODE_P (code)
4774 && code != WIDEN_MULT_EXPR)
4775 return false;
4776
4777 op0 = gimple_assign_rhs1 (stmt);
4778 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4779 if (!vectype_in)
4780 return false;
4781 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4782
4783 scalar_dest = gimple_assign_lhs (stmt);
4784 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4785 if (!vectype_out)
4786 return false;
4787 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4788 if (nunits_in <= nunits_out)
4789 return false;
4790
4791 /* Multiple types in SLP are handled by creating the appropriate number of
4792 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4793 case of SLP. */
4794 if (slp_node)
4795 ncopies = 1;
4796 else
4797 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4798
4799 gcc_assert (ncopies >= 1);
4800
4801 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4802 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4803 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4804 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4805 && CONVERT_EXPR_CODE_P (code))))
4806 return false;
4807
4808 /* Check the operands of the operation. */
4809 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4810 {
4811 if (vect_print_dump_info (REPORT_DETAILS))
4812 fprintf (vect_dump, "use not simple.");
4813 return false;
4814 }
4815
4816 op_type = TREE_CODE_LENGTH (code);
4817 if (op_type == binary_op)
4818 {
4819 op1 = gimple_assign_rhs2 (stmt);
4820 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4821 {
4822 if (vect_print_dump_info (REPORT_DETAILS))
4823 fprintf (vect_dump, "use not simple.");
4824 return false;
4825 }
4826 }
4827
4828 /* Supportable by target? */
4829 if (!supportable_widening_operation (code, stmt, vectype_in,
4830 &decl1, &decl2, &code1, &code2,
4831 &multi_step_cvt, &interm_types))
4832 return false;
4833
4834 /* Binary widening operation can only be supported directly by the
4835 architecture. */
4836 gcc_assert (!(multi_step_cvt && op_type == binary_op));
4837
4838 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4839
4840 if (!vec_stmt) /* transformation not required. */
4841 {
4842 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4843 if (vect_print_dump_info (REPORT_DETAILS))
4844 fprintf (vect_dump, "=== vectorizable_promotion ===");
4845 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4846 return true;
4847 }
4848
4849 /** Transform. **/
4850
4851 if (vect_print_dump_info (REPORT_DETAILS))
4852 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4853 ncopies);
4854
4855 /* Handle def. */
4856 /* In case of multi-step promotion, we first generate promotion operations
4857 to the intermediate types, and then from those types to the final one.
4858 We store the vector destinations in VEC_DSTS in the correct order for
4859 recursive creation of promotion operations in
4860 vect_create_vectorized_promotion_stmts(). Vector destinations are created
4861 according to the TYPES received from supportable_widening_operation(). */
4862 if (multi_step_cvt)
4863 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4864 else
4865 vec_dsts = VEC_alloc (tree, heap, 1);
4866
4867 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4868 VEC_quick_push (tree, vec_dsts, vec_dest);
4869
4870 if (multi_step_cvt)
4871 {
4872 for (i = VEC_length (tree, interm_types) - 1;
4873 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4874 {
4875 vec_dest = vect_create_destination_var (scalar_dest,
4876 intermediate_type);
4877 VEC_quick_push (tree, vec_dsts, vec_dest);
4878 }
4879 }
4880
4881 if (!slp_node)
4882 {
4883 vec_oprnds0 = VEC_alloc (tree, heap,
4884 (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1));
4885 if (op_type == binary_op)
4886 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4887 }
4888
4889 /* In case the vectorization factor (VF) is bigger than the number
4890 of elements that we can fit in a vectype (nunits), we have to generate
4891 more than one vector stmt - i.e - we need to "unroll" the
4892 vector stmt by a factor VF/nunits. */
4893
4894 prev_stmt_info = NULL;
4895 for (j = 0; j < ncopies; j++)
4896 {
4897 /* Handle uses. */
4898 if (j == 0)
4899 {
4900 if (slp_node)
4901 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
4902 else
4903 {
4904 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4905 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
4906 if (op_type == binary_op)
4907 {
4908 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4909 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4910 }
4911 }
4912 }
4913 else
4914 {
4915 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4916 VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
4917 if (op_type == binary_op)
4918 {
4919 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4920 VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
4921 }
4922 }
4923
4924 /* Arguments are ready. Create the new vector stmts. */
4925 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4926 vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1,
4927 multi_step_cvt, stmt,
4928 tmp_vec_dsts,
4929 gsi, slp_node, code1, code2,
4930 decl1, decl2, op_type,
4931 &prev_stmt_info);
4932 }
4933
4934 VEC_free (tree, heap, vec_dsts);
4935 VEC_free (tree, heap, tmp_vec_dsts);
4936 VEC_free (tree, heap, interm_types);
4937 VEC_free (tree, heap, vec_oprnds0);
4938 VEC_free (tree, heap, vec_oprnds1);
4939
4940 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4941 return true;
4942 }
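/* For illustration only, assuming VF == 8 and a short -> int promotion
   with V8HI inputs and V4SI outputs: each input vector yields two
   result vectors,

     vi_lo = VEC_UNPACK_LO_EXPR <vs0>;
     vi_hi = VEC_UNPACK_HI_EXPR <vs0>;

   while for WIDEN_MULT_EXPR the corresponding widening-multiply hi/lo
   pair (or a target builtin) is used instead.  A multi-step promotion
   such as char -> int repeats this through the intermediate types in
   INTERM_TYPES.  */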
4943
4944
4945 /* Function vect_strided_store_supported.
4946
4947 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4948 and FALSE otherwise. */
4949
4950 static bool
4951 vect_strided_store_supported (tree vectype)
4952 {
4953 optab interleave_high_optab, interleave_low_optab;
4954 int mode;
4955
4956 mode = (int) TYPE_MODE (vectype);
4957
4958 /* Check that the operation is supported. */
4959 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4960 vectype, optab_default);
4961 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4962 vectype, optab_default);
4963 if (!interleave_high_optab || !interleave_low_optab)
4964 {
4965 if (vect_print_dump_info (REPORT_DETAILS))
4966 fprintf (vect_dump, "no optab for interleave.");
4967 return false;
4968 }
4969
4970 if (optab_handler (interleave_high_optab, mode)->insn_code
4971 == CODE_FOR_nothing
4972 || optab_handler (interleave_low_optab, mode)->insn_code
4973 == CODE_FOR_nothing)
4974 {
4975 if (vect_print_dump_info (REPORT_DETAILS))
4976 fprintf (vect_dump, "interleave op not supported by target.");
4977 return false;
4978 }
4979
4980 return true;
4981 }
4982
4983
4984 /* Function vect_permute_store_chain.
4985
4986 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4987 a power of 2, generate interleave_high/low stmts to reorder the data
4988 correctly for the stores. Return the final references for stores in
4989 RESULT_CHAIN.
4990
4991 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4992 The input is 4 vectors each containing 8 elements. We assign a number to each
4993 element; the input sequence is:
4994
4995 1st vec: 0 1 2 3 4 5 6 7
4996 2nd vec: 8 9 10 11 12 13 14 15
4997 3rd vec: 16 17 18 19 20 21 22 23
4998 4th vec: 24 25 26 27 28 29 30 31
4999
5000 The output sequence should be:
5001
5002 1st vec: 0 8 16 24 1 9 17 25
5003 2nd vec: 2 10 18 26 3 11 19 27
5004 3rd vec: 4 12 20 28 5 13 21 29
5005 4th vec: 6 14 22 30 7 15 23 31
5006
5007 i.e., we interleave the contents of the four vectors in their order.
5008
5009 We use interleave_high/low instructions to create such output. The input of
5010 each interleave_high/low operation is two vectors:
5011 1st vec 2nd vec
5012 0 1 2 3 4 5 6 7
5013 the even elements of the result vector are obtained left-to-right from the
5014 high/low elements of the first vector. The odd elements of the result are
5015 obtained left-to-right from the high/low elements of the second vector.
5016 The output of interleave_high will be: 0 4 1 5
5017 and of interleave_low: 2 6 3 7
5018
5019
5020 The permutation is done in log2 (LENGTH) stages. In each stage interleave_high
5021 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5022 where the first argument is taken from the first half of DR_CHAIN and the
5023 second argument from its second half.
5024 In our example,
5025
5026 I1: interleave_high (1st vec, 3rd vec)
5027 I2: interleave_low (1st vec, 3rd vec)
5028 I3: interleave_high (2nd vec, 4th vec)
5029 I4: interleave_low (2nd vec, 4th vec)
5030
5031 The output for the first stage is:
5032
5033 I1: 0 16 1 17 2 18 3 19
5034 I2: 4 20 5 21 6 22 7 23
5035 I3: 8 24 9 25 10 26 11 27
5036 I4: 12 28 13 29 14 30 15 31
5037
5038 The output of the second stage, i.e. the final result is:
5039
5040 I1: 0 8 16 24 1 9 17 25
5041 I2: 2 10 18 26 3 11 19 27
5042 I3: 4 12 20 28 5 13 21 29
5043 I4: 6 14 22 30 7 15 23 31. */
5044
5045 static bool
5046 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
5047 unsigned int length,
5048 gimple stmt,
5049 gimple_stmt_iterator *gsi,
5050 VEC(tree,heap) **result_chain)
5051 {
5052 tree perm_dest, vect1, vect2, high, low;
5053 gimple perm_stmt;
5054 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5055 tree scalar_dest;
5056 int i;
5057 unsigned int j;
5058 enum tree_code high_code, low_code;
5059
5060 scalar_dest = gimple_assign_lhs (stmt);
5061
5062 /* Check that the operation is supported. */
5063 if (!vect_strided_store_supported (vectype))
5064 return false;
5065
5066 *result_chain = VEC_copy (tree, heap, dr_chain);
5067
5068 for (i = 0; i < exact_log2 (length); i++)
5069 {
5070 for (j = 0; j < length/2; j++)
5071 {
5072 vect1 = VEC_index (tree, dr_chain, j);
5073 vect2 = VEC_index (tree, dr_chain, j+length/2);
5074
5075 /* Create interleaving stmt:
5076 in the case of big endian:
5077 high = interleave_high (vect1, vect2)
5078 and in the case of little endian:
5079 high = interleave_low (vect1, vect2). */
5080 perm_dest = create_tmp_var (vectype, "vect_inter_high");
5081 DECL_GIMPLE_REG_P (perm_dest) = 1;
5082 add_referenced_var (perm_dest);
5083 if (BYTES_BIG_ENDIAN)
5084 {
5085 high_code = VEC_INTERLEAVE_HIGH_EXPR;
5086 low_code = VEC_INTERLEAVE_LOW_EXPR;
5087 }
5088 else
5089 {
5090 low_code = VEC_INTERLEAVE_HIGH_EXPR;
5091 high_code = VEC_INTERLEAVE_LOW_EXPR;
5092 }
5093 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest,
5094 vect1, vect2);
5095 high = make_ssa_name (perm_dest, perm_stmt);
5096 gimple_assign_set_lhs (perm_stmt, high);
5097 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5098 VEC_replace (tree, *result_chain, 2*j, high);
5099
5100 /* Create interleaving stmt:
5101 in the case of big endian:
5102 low = interleave_low (vect1, vect2)
5103 and in the case of little endian:
5104 low = interleave_high (vect1, vect2). */
5105 perm_dest = create_tmp_var (vectype, "vect_inter_low");
5106 DECL_GIMPLE_REG_P (perm_dest) = 1;
5107 add_referenced_var (perm_dest);
5108 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest,
5109 vect1, vect2);
5110 low = make_ssa_name (perm_dest, perm_stmt);
5111 gimple_assign_set_lhs (perm_stmt, low);
5112 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5113 VEC_replace (tree, *result_chain, 2*j+1, low);
5114 }
5115 dr_chain = VEC_copy (tree, heap, *result_chain);
5116 }
5117 return true;
5118 }
5119
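/* Editorial sketch (not part of this file; kept under #if 0 so it does not
   take part in the build): the interleave permutation performed by
   vect_permute_store_chain above, modelled on plain int arrays for the
   4-vector, 8-element example from its comment.  interleave_high pairs
   the first halves of its two operands, interleave_low the second halves.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NVEC 4
#define NELT 8

static void
interleave (const int *v1, const int *v2, int *high, int *low)
{
  int k;
  for (k = 0; k < NELT / 2; k++)
    {
      high[2 * k]     = v1[k];
      high[2 * k + 1] = v2[k];
      low[2 * k]      = v1[NELT / 2 + k];
      low[2 * k + 1]  = v2[NELT / 2 + k];
    }
}

int
main (void)
{
  int chain[NVEC][NELT], result[NVEC][NELT];
  int i, j, stage;

  for (i = 0; i < NVEC; i++)
    for (j = 0; j < NELT; j++)
      chain[i][j] = i * NELT + j;        /* elements 0 .. 31  */

  for (stage = 0; stage < 2; stage++)    /* log2 (NVEC) stages  */
    {
      for (j = 0; j < NVEC / 2; j++)
        interleave (chain[j], chain[j + NVEC / 2],
                    result[2 * j], result[2 * j + 1]);
      memcpy (chain, result, sizeof (chain));
    }

  /* Prints 0 8 16 24 1 9 17 25 / 2 10 18 26 3 11 19 27 / ...  */
  for (i = 0; i < NVEC; i++)
    {
      for (j = 0; j < NELT; j++)
        printf ("%d ", chain[i][j]);
      printf ("\n");
    }
  return 0;
}
#endif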
5120
5121 /* Function vectorizable_store.
5122
5123 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
5124 can be vectorized.
5125 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5126 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5127 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5128
5129 bool
5130 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
5131 slp_tree slp_node)
5132 {
5133 tree scalar_dest;
5134 tree data_ref;
5135 tree op;
5136 tree vec_oprnd = NULL_TREE;
5137 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5138 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
5139 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5140 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5141 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5142 enum machine_mode vec_mode;
5143 tree dummy;
5144 enum dr_alignment_support alignment_support_scheme;
5145 tree def;
5146 gimple def_stmt;
5147 enum vect_def_type dt;
5148 stmt_vec_info prev_stmt_info = NULL;
5149 tree dataref_ptr = NULL_TREE;
5150 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5151 int ncopies;
5152 int j;
5153 gimple next_stmt, first_stmt = NULL;
5154 bool strided_store = false;
5155 unsigned int group_size, i;
5156 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
5157 bool inv_p;
5158 VEC(tree,heap) *vec_oprnds = NULL;
5159 bool slp = (slp_node != NULL);
5160 stmt_vec_info first_stmt_vinfo;
5161 unsigned int vec_num;
5162
5163 /* Multiple types in SLP are handled by creating the appropriate number of
5164 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
5165 case of SLP. */
5166 if (slp)
5167 ncopies = 1;
5168 else
5169 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5170
5171 gcc_assert (ncopies >= 1);
5172
5173 /* FORNOW. This restriction should be relaxed. */
5174 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
5175 {
5176 if (vect_print_dump_info (REPORT_DETAILS))
5177 fprintf (vect_dump, "multiple types in nested loop.");
5178 return false;
5179 }
5180
5181 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5182 return false;
5183
5184 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5185 return false;
5186
5187 /* Is vectorizable store? */
5188
5189 if (!is_gimple_assign (stmt))
5190 return false;
5191
5192 scalar_dest = gimple_assign_lhs (stmt);
5193 if (TREE_CODE (scalar_dest) != ARRAY_REF
5194 && TREE_CODE (scalar_dest) != INDIRECT_REF
5195 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5196 return false;
5197
5198 gcc_assert (gimple_assign_single_p (stmt));
5199 op = gimple_assign_rhs1 (stmt);
5200 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5201 {
5202 if (vect_print_dump_info (REPORT_DETAILS))
5203 fprintf (vect_dump, "use not simple.");
5204 return false;
5205 }
5206
5207 /* The scalar rhs type needs to be trivially convertible to the vector
5208 component type. This should always be the case. */
5209 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (op)))
5210 {
5211 if (vect_print_dump_info (REPORT_DETAILS))
5212 fprintf (vect_dump, "??? operands of different types");
5213 return false;
5214 }
5215
5216 vec_mode = TYPE_MODE (vectype);
5217 /* FORNOW. In some cases can vectorize even if data-type not supported
5218 (e.g. - array initialization with 0). */
5219 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
5220 return false;
5221
5222 if (!STMT_VINFO_DATA_REF (stmt_info))
5223 return false;
5224
5225 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5226 {
5227 strided_store = true;
5228 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5229 if (!vect_strided_store_supported (vectype)
5230 && !PURE_SLP_STMT (stmt_info) && !slp)
5231 return false;
5232
5233 if (first_stmt == stmt)
5234 {
5235 /* STMT is the leader of the group. Check the operands of all the
5236 stmts of the group. */
5237 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
5238 while (next_stmt)
5239 {
5240 gcc_assert (gimple_assign_single_p (next_stmt));
5241 op = gimple_assign_rhs1 (next_stmt);
5242 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5243 {
5244 if (vect_print_dump_info (REPORT_DETAILS))
5245 fprintf (vect_dump, "use not simple.");
5246 return false;
5247 }
5248 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5249 }
5250 }
5251 }
5252
5253 if (!vec_stmt) /* transformation not required. */
5254 {
5255 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
5256 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
5257 return true;
5258 }
5259
5260 /** Transform. **/
5261
5262 if (strided_store)
5263 {
5264 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5265 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5266
5267 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
5268
5269 /* FORNOW */
5270 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5271
5272 /* We vectorize all the stmts of the interleaving group when we
5273 reach the last stmt in the group. */
5274 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
5275 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
5276 && !slp)
5277 {
5278 *vec_stmt = NULL;
5279 return true;
5280 }
5281
5282 if (slp)
5283 strided_store = false;
5284
5285 /* VEC_NUM is the number of vect stmts to be created for this group. */
5286 if (slp)
5287 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5288 else
5289 vec_num = group_size;
5290 }
5291 else
5292 {
5293 first_stmt = stmt;
5294 first_dr = dr;
5295 group_size = vec_num = 1;
5296 first_stmt_vinfo = stmt_info;
5297 }
5298
5299 if (vect_print_dump_info (REPORT_DETAILS))
5300 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
5301
5302 dr_chain = VEC_alloc (tree, heap, group_size);
5303 oprnds = VEC_alloc (tree, heap, group_size);
5304
5305 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5306 gcc_assert (alignment_support_scheme);
5307 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
5308
5309 /* In case the vectorization factor (VF) is bigger than the number
5310 of elements that we can fit in a vectype (nunits), we have to generate
5311 more than one vector stmt - i.e - we need to "unroll" the
5312 vector stmt by a factor VF/nunits. For more details see documentation in
5313 vect_get_vec_def_for_copy_stmt. */
5314
5315 /* In case of interleaving (non-unit strided access):
5316
5317 S1: &base + 2 = x2
5318 S2: &base = x0
5319 S3: &base + 1 = x1
5320 S4: &base + 3 = x3
5321
5322 We create vectorized stores starting from base address (the access of the
5323 first stmt in the chain (S2 in the above example), when the last store stmt
5324 of the chain (S4) is reached:
5325
5326 VS1: &base = vx2
5327 VS2: &base + vec_size*1 = vx0
5328 VS3: &base + vec_size*2 = vx1
5329 VS4: &base + vec_size*3 = vx3
5330
5331 Then permutation statements are generated:
5332
5333 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
5334 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
5335 ...
5336
5337 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5338 (the order of the data-refs in the output of vect_permute_store_chain
5339 corresponds to the order of scalar stmts in the interleaving chain - see
5340 the documentation of vect_permute_store_chain()).
5341
5342 In case of both multiple types and interleaving, above vector stores and
5343 permutation stmts are created for every copy. The result vector stmts are
5344 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5345 STMT_VINFO_RELATED_STMT for the next copies.
5346 */
5347
5348 prev_stmt_info = NULL;
5349 for (j = 0; j < ncopies; j++)
5350 {
5351 gimple new_stmt;
5352 gimple ptr_incr;
5353
5354 if (j == 0)
5355 {
5356 if (slp)
5357 {
5358 /* Get vectorized arguments for SLP_NODE. */
5359 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
5360
5361 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
5362 }
5363 else
5364 {
5365 /* For interleaved stores we collect vectorized defs for all the
5366 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
5367 used as an input to vect_permute_store_chain(), and OPRNDS as
5368 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
5369
5370 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5371 OPRNDS are of size 1. */
5372 next_stmt = first_stmt;
5373 for (i = 0; i < group_size; i++)
5374 {
5375 /* Since gaps are not supported for interleaved stores,
5376 GROUP_SIZE is the exact number of stmts in the chain.
5377 Therefore, NEXT_STMT can't be NULL_TREE. In case that
5378 there is no interleaving, GROUP_SIZE is 1, and only one
5379 iteration of the loop will be executed. */
5380 gcc_assert (next_stmt
5381 && gimple_assign_single_p (next_stmt));
5382 op = gimple_assign_rhs1 (next_stmt);
5383
5384 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
5385 NULL);
5386 VEC_quick_push(tree, dr_chain, vec_oprnd);
5387 VEC_quick_push(tree, oprnds, vec_oprnd);
5388 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5389 }
5390 }
5391
5392 /* We should have caught mismatched types earlier. */
5393 gcc_assert (useless_type_conversion_p (vectype,
5394 TREE_TYPE (vec_oprnd)));
5395 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5396 &dummy, &ptr_incr, false,
5397 &inv_p, NULL);
5398 gcc_assert (!inv_p);
5399 }
5400 else
5401 {
5402 /* For interleaved stores we created vectorized defs for all the
5403 defs stored in OPRNDS in the previous iteration (previous copy).
5404 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5405 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5406 next copy.
5407 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5408 OPRNDS are of size 1. */
5409 for (i = 0; i < group_size; i++)
5410 {
5411 op = VEC_index (tree, oprnds, i);
5412 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5413 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5414 VEC_replace(tree, dr_chain, i, vec_oprnd);
5415 VEC_replace(tree, oprnds, i, vec_oprnd);
5416 }
5417 dataref_ptr =
5418 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5419 }
5420
5421 if (strided_store)
5422 {
5423 result_chain = VEC_alloc (tree, heap, group_size);
5424 /* Permute. */
5425 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi,
5426 &result_chain))
5427 return false;
5428 }
5429
5430 next_stmt = first_stmt;
5431 for (i = 0; i < vec_num; i++)
5432 {
5433 if (i > 0)
5434 /* Bump the vector pointer. */
5435 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5436 NULL_TREE);
5437
5438 if (slp)
5439 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5440 else if (strided_store)
5441 /* For strided stores vectorized defs are interleaved in
5442 vect_permute_store_chain(). */
5443 vec_oprnd = VEC_index (tree, result_chain, i);
5444
5445 data_ref = build_fold_indirect_ref (dataref_ptr);
5446
5447 /* Arguments are ready. Create the new vector stmt. */
5448 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5449 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5450 mark_symbols_for_renaming (new_stmt);
5451
5452 if (slp)
5453 continue;
5454
5455 if (j == 0)
5456 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5457 else
5458 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5459
5460 prev_stmt_info = vinfo_for_stmt (new_stmt);
5461 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5462 if (!next_stmt)
5463 break;
5464 }
5465 }
5466
5467 VEC_free (tree, heap, dr_chain);
5468 VEC_free (tree, heap, oprnds);
5469 if (result_chain)
5470 VEC_free (tree, heap, result_chain);
5471
5472 return true;
5473 }
5474
5475
5476 /* Function vect_setup_realignment
5477
5478 This function is called when vectorizing an unaligned load using
5479 the dr_explicit_realign[_optimized] scheme.
5480 This function generates the following code at the loop prolog:
5481
5482 p = initial_addr;
5483 x msq_init = *(floor(p)); # prolog load
5484 realignment_token = call target_builtin;
5485 loop:
5486 x msq = phi (msq_init, ---)
5487
5488 The stmts marked with x are generated only for the case of
5489 dr_explicit_realign_optimized.
5490
5491 The code above sets up a new (vector) pointer, pointing to the first
5492 location accessed by STMT, and a "floor-aligned" load using that pointer.
5493 It also generates code to compute the "realignment-token" (if the relevant
5494 target hook was defined), and creates a phi-node at the loop-header bb
5495 whose arguments are the result of the prolog-load (created by this
5496 function) and the result of a load that takes place in the loop (to be
5497 created by the caller to this function).
5498
5499 For the case of dr_explicit_realign_optimized:
5500 The caller to this function uses the phi-result (msq) to create the
5501 realignment code inside the loop, and sets up the missing phi argument,
5502 as follows:
5503 loop:
5504 msq = phi (msq_init, lsq)
5505 lsq = *(floor(p')); # load in loop
5506 result = realign_load (msq, lsq, realignment_token);
5507
5508 For the case of dr_explicit_realign:
5509 loop:
5510 msq = *(floor(p)); # load in loop
5511 p' = p + (VS-1);
5512 lsq = *(floor(p')); # load in loop
5513 result = realign_load (msq, lsq, realignment_token);
5514
5515 Input:
5516 STMT - (scalar) load stmt to be vectorized. This load accesses
5517 a memory location that may be unaligned.
5518 BSI - place where new code is to be inserted.
5519 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5520 is used.
5521
5522 Output:
5523 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5524 target hook, if defined.
5525 Return value - the result of the loop-header phi node. */
5526
5527 static tree
5528 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5529 tree *realignment_token,
5530 enum dr_alignment_support alignment_support_scheme,
5531 tree init_addr,
5532 struct loop **at_loop)
5533 {
5534 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5535 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5536 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5537 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5538 edge pe;
5539 tree scalar_dest = gimple_assign_lhs (stmt);
5540 tree vec_dest;
5541 gimple inc;
5542 tree ptr;
5543 tree data_ref;
5544 gimple new_stmt;
5545 basic_block new_bb;
5546 tree msq_init = NULL_TREE;
5547 tree new_temp;
5548 gimple phi_stmt;
5549 tree msq = NULL_TREE;
5550 gimple_seq stmts = NULL;
5551 bool inv_p;
5552 bool compute_in_loop = false;
5553 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5554 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5555 struct loop *loop_for_initial_load;
5556
5557 gcc_assert (alignment_support_scheme == dr_explicit_realign
5558 || alignment_support_scheme == dr_explicit_realign_optimized);
5559
5560 /* We need to generate three things:
5561 1. the misalignment computation
5562 2. the extra vector load (for the optimized realignment scheme).
5563 3. the phi node for the two vectors from which the realignment is
5564 done (for the optimized realignment scheme).
5565 */
5566
5567 /* 1. Determine where to generate the misalignment computation.
5568
5569 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5570 calculation will be generated by this function, outside the loop (in the
5571 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5572 caller, inside the loop.
5573
5574 Background: If the misalignment remains fixed throughout the iterations of
5575 the loop, then both realignment schemes are applicable, and also the
5576 misalignment computation can be done outside LOOP. This is because we are
5577 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5578 are a multiple of VS (the Vector Size), and therefore the misalignment in
5579 different vectorized LOOP iterations is always the same.
5580 The problem arises only if the memory access is in an inner-loop nested
5581 inside LOOP, which is now being vectorized using outer-loop vectorization.
5582 This is the only case when the misalignment of the memory access may not
5583 remain fixed throughout the iterations of the inner-loop (as explained in
5584 detail in vect_supportable_dr_alignment). In this case, not only is the
5585 optimized realignment scheme not applicable, but also the misalignment
5586 computation (and generation of the realignment token that is passed to
5587 REALIGN_LOAD) have to be done inside the loop.
5588
5589 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5590 or not, which in turn determines if the misalignment is computed inside
5591 the inner-loop, or outside LOOP. */
5592
5593 if (init_addr != NULL_TREE)
5594 {
5595 compute_in_loop = true;
5596 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5597 }
5598
5599
5600 /* 2. Determine where to generate the extra vector load.
5601
5602 For the optimized realignment scheme, instead of generating two vector
5603 loads in each iteration, we generate a single extra vector load in the
5604 preheader of the loop, and in each iteration reuse the result of the
5605 vector load from the previous iteration. In case the memory access is in
5606 an inner-loop nested inside LOOP, which is now being vectorized using
5607 outer-loop vectorization, we need to determine whether this initial vector
5608 load should be generated at the preheader of the inner-loop, or can be
5609 generated at the preheader of LOOP. If the memory access has no evolution
5610 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5611 to be generated inside LOOP (in the preheader of the inner-loop). */
5612
5613 if (nested_in_vect_loop)
5614 {
5615 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5616 bool invariant_in_outerloop =
5617 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5618 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5619 }
5620 else
5621 loop_for_initial_load = loop;
5622 if (at_loop)
5623 *at_loop = loop_for_initial_load;
5624
5625 /* 3. For the case of the optimized realignment, create the first vector
5626 load at the loop preheader. */
5627
5628 if (alignment_support_scheme == dr_explicit_realign_optimized)
5629 {
5630 /* Create msq_init = *(floor(p1)) in the loop preheader */
5631
5632 gcc_assert (!compute_in_loop);
5633 pe = loop_preheader_edge (loop_for_initial_load);
5634 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5635 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5636 &init_addr, &inc, true, &inv_p, NULL_TREE);
5637 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5638 new_stmt = gimple_build_assign (vec_dest, data_ref);
5639 new_temp = make_ssa_name (vec_dest, new_stmt);
5640 gimple_assign_set_lhs (new_stmt, new_temp);
5641 mark_symbols_for_renaming (new_stmt);
5642 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5643 gcc_assert (!new_bb);
5644 msq_init = gimple_assign_lhs (new_stmt);
5645 }
5646
5647 /* 4. Create realignment token using a target builtin, if available.
5648 It is done either inside the containing loop, or before LOOP (as
5649 determined above). */
5650
5651 if (targetm.vectorize.builtin_mask_for_load)
5652 {
5653 tree builtin_decl;
5654
5655 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5656 if (compute_in_loop)
5657 gcc_assert (init_addr); /* already computed by the caller. */
5658 else
5659 {
5660 /* Generate the INIT_ADDR computation outside LOOP. */
5661 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5662 NULL_TREE, loop);
5663 pe = loop_preheader_edge (loop);
5664 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5665 gcc_assert (!new_bb);
5666 }
5667
5668 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5669 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5670 vec_dest =
5671 vect_create_destination_var (scalar_dest,
5672 gimple_call_return_type (new_stmt));
5673 new_temp = make_ssa_name (vec_dest, new_stmt);
5674 gimple_call_set_lhs (new_stmt, new_temp);
5675
5676 if (compute_in_loop)
5677 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5678 else
5679 {
5680 /* Generate the misalignment computation outside LOOP. */
5681 pe = loop_preheader_edge (loop);
5682 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5683 gcc_assert (!new_bb);
5684 }
5685
5686 *realignment_token = gimple_call_lhs (new_stmt);
5687
5688 /* The result of the CALL_EXPR to this builtin is determined from
5689 the value of the parameter and no global variables are touched
5690 which makes the builtin a "const" function. Requiring the
5691 builtin to have the "const" attribute makes it unnecessary
5692 to call mark_call_clobbered. */
5693 gcc_assert (TREE_READONLY (builtin_decl));
5694 }
5695
5696 if (alignment_support_scheme == dr_explicit_realign)
5697 return msq;
5698
5699 gcc_assert (!compute_in_loop);
5700 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5701
5702
5703 /* 5. Create msq = phi <msq_init, lsq> in loop */
5704
5705 pe = loop_preheader_edge (containing_loop);
5706 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5707 msq = make_ssa_name (vec_dest, NULL);
5708 phi_stmt = create_phi_node (msq, containing_loop->header);
5709 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5710 add_phi_arg (phi_stmt, msq_init, pe);
5711
5712 return msq;
5713 }
5714
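/* Editorial sketch (not part of this file; kept under #if 0 so it does not
   take part in the build): how the two aligned loads set up above combine
   into one unaligned access.  It assumes 4-element int vectors and a
   misalignment OFS in elements, which stands in for the realignment token;
   the hypothetical realign_load concatenates MSQ and LSQ and extracts 4
   elements starting at OFS.  */
#if 0
static void
realign_load_sketch (const int *p, int ofs, int result[4])
{
  const int *aligned = p - ofs;   /* floor (p)  */
  int msq[4], lsq[4], i;

  for (i = 0; i < 4; i++)
    msq[i] = aligned[i];          /* msq = *(floor (p))           */
  for (i = 0; i < 4; i++)
    lsq[i] = aligned[4 + i];      /* lsq = *(floor (p + VS - 1))  */

  for (i = 0; i < 4; i++)         /* realign_load (msq, lsq, ofs) */
    result[i] = ofs + i < 4 ? msq[ofs + i] : lsq[ofs + i - 4];
}
#endif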
5715
5716 /* Function vect_strided_load_supported.
5717
5718 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5719 and FALSE otherwise. */
5720
5721 static bool
5722 vect_strided_load_supported (tree vectype)
5723 {
5724 optab perm_even_optab, perm_odd_optab;
5725 int mode;
5726
5727 mode = (int) TYPE_MODE (vectype);
5728
5729 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5730 optab_default);
5731 if (!perm_even_optab)
5732 {
5733 if (vect_print_dump_info (REPORT_DETAILS))
5734 fprintf (vect_dump, "no optab for perm_even.");
5735 return false;
5736 }
5737
5738 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5739 {
5740 if (vect_print_dump_info (REPORT_DETAILS))
5741 fprintf (vect_dump, "perm_even op not supported by target.");
5742 return false;
5743 }
5744
5745 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5746 optab_default);
5747 if (!perm_odd_optab)
5748 {
5749 if (vect_print_dump_info (REPORT_DETAILS))
5750 fprintf (vect_dump, "no optab for perm_odd.");
5751 return false;
5752 }
5753
5754 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5755 {
5756 if (vect_print_dump_info (REPORT_DETAILS))
5757 fprintf (vect_dump, "perm_odd op not supported by target.");
5758 return false;
5759 }
5760 return true;
5761 }
5762
5763
5764 /* Function vect_permute_load_chain.
5765
5766 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5767 a power of 2, generate extract_even/odd stmts to reorder the input data
5768 correctly. Return the final references for loads in RESULT_CHAIN.
5769
5770 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5771 The input is 4 vectors each containing 8 elements. We assign a number to each
5772 element; the input sequence is:
5773
5774 1st vec: 0 1 2 3 4 5 6 7
5775 2nd vec: 8 9 10 11 12 13 14 15
5776 3rd vec: 16 17 18 19 20 21 22 23
5777 4th vec: 24 25 26 27 28 29 30 31
5778
5779 The output sequence should be:
5780
5781 1st vec: 0 4 8 12 16 20 24 28
5782 2nd vec: 1 5 9 13 17 21 25 29
5783 3rd vec: 2 6 10 14 18 22 26 30
5784 4th vec: 3 7 11 15 19 23 27 31
5785
5786 i.e., the first output vector should contain the first elements of each
5787 interleaving group, etc.
5788
5789 We use extract_even/odd instructions to create such output. The input of each
5790 extract_even/odd operation is two vectors
5791 1st vec 2nd vec
5792 0 1 2 3 4 5 6 7
5793
5794 and the output is the vector of extracted even/odd elements. The output of
5795 extract_even will be: 0 2 4 6
5796 and of extract_odd: 1 3 5 7
5797
5798
5799 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
5800 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5801 order. In our example,
5802
5803 E1: extract_even (1st vec, 2nd vec)
5804 E2: extract_odd (1st vec, 2nd vec)
5805 E3: extract_even (3rd vec, 4th vec)
5806 E4: extract_odd (3rd vec, 4th vec)
5807
5808 The output for the first stage will be:
5809
5810 E1: 0 2 4 6 8 10 12 14
5811 E2: 1 3 5 7 9 11 13 15
5812 E3: 16 18 20 22 24 26 28 30
5813 E4: 17 19 21 23 25 27 29 31
5814
5815 In order to proceed and create the correct sequence for the next stage (or
5816 for the correct output, if the second stage is the last one, as in our
5817 example), we first put the output of extract_even operation and then the
5818 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5819 The input for the second stage is:
5820
5821 1st vec (E1): 0 2 4 6 8 10 12 14
5822 2nd vec (E3): 16 18 20 22 24 26 28 30
5823 3rd vec (E2): 1 3 5 7 9 11 13 15
5824 4th vec (E4): 17 19 21 23 25 27 29 31
5825
5826 The output of the second stage:
5827
5828 E1: 0 4 8 12 16 20 24 28
5829 E2: 2 6 10 14 18 22 26 30
5830 E3: 1 5 9 13 17 21 25 29
5831 E4: 3 7 11 15 19 23 27 31
5832
5833 And RESULT_CHAIN after reordering:
5834
5835 1st vec (E1): 0 4 8 12 16 20 24 28
5836 2nd vec (E3): 1 5 9 13 17 21 25 29
5837 3rd vec (E2): 2 6 10 14 18 22 26 30
5838 4th vec (E4): 3 7 11 15 19 23 27 31. */
5839
5840 static bool
5841 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5842 unsigned int length,
5843 gimple stmt,
5844 gimple_stmt_iterator *gsi,
5845 VEC(tree,heap) **result_chain)
5846 {
5847 tree perm_dest, data_ref, first_vect, second_vect;
5848 gimple perm_stmt;
5849 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5850 int i;
5851 unsigned int j;
5852
5853 /* Check that the operation is supported. */
5854 if (!vect_strided_load_supported (vectype))
5855 return false;
5856
5857 *result_chain = VEC_copy (tree, heap, dr_chain);
5858 for (i = 0; i < exact_log2 (length); i++)
5859 {
5860 for (j = 0; j < length; j +=2)
5861 {
5862 first_vect = VEC_index (tree, dr_chain, j);
5863 second_vect = VEC_index (tree, dr_chain, j+1);
5864
5865 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5866 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5867 DECL_GIMPLE_REG_P (perm_dest) = 1;
5868 add_referenced_var (perm_dest);
5869
5870 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
5871 perm_dest, first_vect,
5872 second_vect);
5873
5874 data_ref = make_ssa_name (perm_dest, perm_stmt);
5875 gimple_assign_set_lhs (perm_stmt, data_ref);
5876 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5877 mark_symbols_for_renaming (perm_stmt);
5878
5879 VEC_replace (tree, *result_chain, j/2, data_ref);
5880
5881 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5882 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5883 DECL_GIMPLE_REG_P (perm_dest) = 1;
5884 add_referenced_var (perm_dest);
5885
5886 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
5887 perm_dest, first_vect,
5888 second_vect);
5889 data_ref = make_ssa_name (perm_dest, perm_stmt);
5890 gimple_assign_set_lhs (perm_stmt, data_ref);
5891 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5892 mark_symbols_for_renaming (perm_stmt);
5893
5894 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5895 }
5896 dr_chain = VEC_copy (tree, heap, *result_chain);
5897 }
5898 return true;
5899 }
5900
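/* Editorial sketch (not part of this file; kept under #if 0 so it does not
   take part in the build): the extract_even/extract_odd permutation
   performed by vect_permute_load_chain above, on the same 4-vector,
   8-element example.  extract_even keeps elements 0,2,4,... of the
   concatenation of its two operands, extract_odd keeps 1,3,5,...  */
#if 0
#include <stdio.h>
#include <string.h>

#define NVEC 4
#define NELT 8

static void
extract_even_odd (const int *v1, const int *v2, int *even, int *odd)
{
  int k;
  for (k = 0; k < NELT; k++)
    {
      even[k] = 2 * k < NELT ? v1[2 * k] : v2[2 * k - NELT];
      odd[k]  = 2 * k + 1 < NELT ? v1[2 * k + 1] : v2[2 * k + 1 - NELT];
    }
}

int
main (void)
{
  int chain[NVEC][NELT], result[NVEC][NELT];
  int i, j, stage;

  for (i = 0; i < NVEC; i++)
    for (j = 0; j < NELT; j++)
      chain[i][j] = i * NELT + j;        /* elements 0 .. 31  */

  for (stage = 0; stage < 2; stage++)    /* log2 (NVEC) stages  */
    {
      for (j = 0; j < NVEC; j += 2)
        extract_even_odd (chain[j], chain[j + 1],
                          result[j / 2], result[j / 2 + NVEC / 2]);
      memcpy (chain, result, sizeof (chain));
    }

  /* Prints 0 4 8 ... 28 / 1 5 9 ... 29 / 2 6 ... 30 / 3 7 ... 31  */
  for (i = 0; i < NVEC; i++)
    {
      for (j = 0; j < NELT; j++)
        printf ("%d ", chain[i][j]);
      printf ("\n");
    }
  return 0;
}
#endif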
5901
5902 /* Function vect_transform_strided_load.
5903
5904 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5905 to perform their permutation and ascribe the resulting vectorized statements to
5906 the scalar statements.
5907 */
5908
5909 static bool
5910 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
5911 gimple_stmt_iterator *gsi)
5912 {
5913 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5914 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5915 gimple next_stmt, new_stmt;
5916 VEC(tree,heap) *result_chain = NULL;
5917 unsigned int i, gap_count;
5918 tree tmp_data_ref;
5919
5920 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5921 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5922 vectors, that are ready for vector computation. */
5923 result_chain = VEC_alloc (tree, heap, size);
5924 /* Permute. */
5925 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
5926 return false;
5927
5928 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5929 Since we scan the chain starting from its first node, their order
5930 corresponds to the order of data-refs in RESULT_CHAIN. */
5931 next_stmt = first_stmt;
5932 gap_count = 1;
5933 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5934 {
5935 if (!next_stmt)
5936 break;
5937
5938 /* Skip the gaps. Loads created for the gaps will be removed by dead
5939 code elimination pass later. No need to check for the first stmt in
5940 the group, since it always exists.
5941 DR_GROUP_GAP is the number of steps in elements from the previous
5942 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5943 correspond to the gaps.
5944 */
5945 if (next_stmt != first_stmt
5946 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5947 {
5948 gap_count++;
5949 continue;
5950 }
5951
5952 while (next_stmt)
5953 {
5954 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5955 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5956 copies, and we put the new vector statement in the first available
5957 RELATED_STMT. */
5958 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5959 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5960 else
5961 {
5962 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5963 {
5964 gimple prev_stmt =
5965 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5966 gimple rel_stmt =
5967 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5968 while (rel_stmt)
5969 {
5970 prev_stmt = rel_stmt;
5971 rel_stmt =
5972 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5973 }
5974
5975 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5976 new_stmt;
5977 }
5978 }
5979
5980 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5981 gap_count = 1;
5982 /* If NEXT_STMT accesses the same DR as the previous statement,
5983 put the same TMP_DATA_REF as its vectorized statement; otherwise
5984 get the next data-ref from RESULT_CHAIN. */
5985 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5986 break;
5987 }
5988 }
5989
5990 VEC_free (tree, heap, result_chain);
5991 return true;
5992 }
5993
5994
5995 /* Create NCOPIES permutation statements using the mask MASK_ARRAY (by
5996 building a vector of type MASK_TYPE from it) and two input vectors placed in
5997 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
5998 shifting by STRIDE elements of DR_CHAIN for every copy.
5999 (STRIDE is the number of vectorized stmts for NODE divided by the number of
6000 copies).
6001 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
6002 the created stmts must be inserted. */
6003
6004 static inline void
6005 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
6006 int *mask_array, int mask_nunits,
6007 tree mask_element_type, tree mask_type,
6008 int first_vec_indx, int second_vec_indx,
6009 gimple_stmt_iterator *gsi, slp_tree node,
6010 tree builtin_decl, tree vectype,
6011 VEC(tree,heap) *dr_chain,
6012 int ncopies, int vect_stmts_counter)
6013 {
6014 tree t = NULL_TREE, mask_vec, mask, perm_dest;
6015 gimple perm_stmt = NULL;
6016 stmt_vec_info next_stmt_info;
6017 int i, group_size, stride, dr_chain_size;
6018 tree first_vec, second_vec, data_ref;
6019 tree sym;
6020 ssa_op_iter iter;
6021 VEC (tree, heap) *params = NULL;
6022
6023 /* Create a vector mask. */
6024 for (i = mask_nunits - 1; i >= 0; --i)
6025 t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
6026 t);
6027 mask_vec = build_vector (mask_type, t);
6028 mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
6029
6030 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
6031 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
6032 dr_chain_size = VEC_length (tree, dr_chain);
6033
6034 /* Initialize the vect stmts of NODE to properly insert the generated
6035 stmts later. */
6036 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
6037 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6038 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
6039
6040 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
6041 for (i = 0; i < ncopies; i++)
6042 {
6043 first_vec = VEC_index (tree, dr_chain, first_vec_indx);
6044 second_vec = VEC_index (tree, dr_chain, second_vec_indx);
6045
6046 /* Build argument list for the vectorized call. */
6047 VEC_free (tree, heap, params);
6048 params = VEC_alloc (tree, heap, 3);
6049 VEC_quick_push (tree, params, first_vec);
6050 VEC_quick_push (tree, params, second_vec);
6051 VEC_quick_push (tree, params, mask);
6052
6053 /* Generate the permute statement. */
6054 perm_stmt = gimple_build_call_vec (builtin_decl, params);
6055 data_ref = make_ssa_name (perm_dest, perm_stmt);
6056 gimple_call_set_lhs (perm_stmt, data_ref);
6057 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6058 FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
6059 {
6060 if (TREE_CODE (sym) == SSA_NAME)
6061 sym = SSA_NAME_VAR (sym);
6062 mark_sym_for_renaming (sym);
6063 }
6064
6065 /* Store the vector statement in NODE. */
6066 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
6067 stride * i + vect_stmts_counter, perm_stmt);
6068
6069 first_vec_indx += stride;
6070 second_vec_indx += stride;
6071 }
6072
6073 /* Mark the scalar stmt as vectorized. */
6074 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
6075 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
6076 }
6077
6078
6079 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
6080 return in CURRENT_MASK_ELEMENT its equivalent in target specific
6081 representation. Check that the mask is valid and return FALSE if not.
6082 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
6083 the next vector, i.e., the current first vector is not needed. */
6084
6085 static bool
6086 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
6087 int mask_nunits, bool only_one_vec, int index,
6088 int *mask, int *current_mask_element,
6089 bool *need_next_vector)
6090 {
6091 int i;
6092 static int number_of_mask_fixes = 1;
6093 static bool mask_fixed = false;
6094 static bool needs_first_vector = false;
6095
6096 /* Convert to target specific representation. */
6097 *current_mask_element = first_mask_element + m;
6098 /* Adjust the value in case it's a mask for second and third vectors. */
6099 *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
6100
6101 if (*current_mask_element < mask_nunits)
6102 needs_first_vector = true;
6103
6104 /* We have only one input vector to permute but the mask accesses values in
6105 the next vector as well. */
6106 if (only_one_vec && *current_mask_element >= mask_nunits)
6107 {
6108 if (vect_print_dump_info (REPORT_DETAILS))
6109 {
6110 fprintf (vect_dump, "permutation requires at least two vectors ");
6111 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6112 }
6113
6114 return false;
6115 }
6116
6117 /* The mask requires the next vector. */
6118 if (*current_mask_element >= mask_nunits * 2)
6119 {
6120 if (needs_first_vector || mask_fixed)
6121 {
6122 /* We either need the first vector too or have already moved to the
6123 next vector. In both cases, this permutation needs three
6124 vectors. */
6125 if (vect_print_dump_info (REPORT_DETAILS))
6126 {
6127 fprintf (vect_dump, "permutation requires at "
6128 "least three vectors ");
6129 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6130 }
6131
6132 return false;
6133 }
6134
6135 /* We move to the next vector, dropping the first one and working with
6136 the second and the third - we need to adjust the values of the mask
6137 accordingly. */
6138 *current_mask_element -= mask_nunits * number_of_mask_fixes;
6139
6140 for (i = 0; i < index; i++)
6141 mask[i] -= mask_nunits * number_of_mask_fixes;
6142
6143 (number_of_mask_fixes)++;
6144 mask_fixed = true;
6145 }
6146
6147 *need_next_vector = mask_fixed;
6148
6149 /* This was the last element of this mask. Start a new one. */
6150 if (index == mask_nunits - 1)
6151 {
6152 number_of_mask_fixes = 1;
6153 mask_fixed = false;
6154 needs_first_vector = false;
6155 }
6156
6157 return true;
6158 }
6159
6160
6161 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6162 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6163 permute statements for SLP_NODE_INSTANCE. */
6164 bool
6165 vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
6166 gimple_stmt_iterator *gsi, int vf,
6167 slp_instance slp_node_instance, bool analyze_only)
6168 {
6169 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6170 tree mask_element_type = NULL_TREE, mask_type;
6171 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
6172 slp_tree node;
6173 tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
6174 gimple next_scalar_stmt;
6175 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6176 int first_mask_element;
6177 int index, unroll_factor, *mask, current_mask_element, ncopies;
6178 bool only_one_vec = false, need_next_vector = false;
6179 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
6180
6181 if (!targetm.vectorize.builtin_vec_perm)
6182 {
6183 if (vect_print_dump_info (REPORT_DETAILS))
6184 {
6185 fprintf (vect_dump, "no builtin for vect permute for ");
6186 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6187 }
6188
6189 return false;
6190 }
6191
6192 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
6193 &mask_element_type);
6194 if (!builtin_decl || !mask_element_type)
6195 {
6196 if (vect_print_dump_info (REPORT_DETAILS))
6197 {
6198 fprintf (vect_dump, "no builtin for vect permute for ");
6199 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6200 }
6201
6202 return false;
6203 }
6204
6205 mask_type = get_vectype_for_scalar_type (mask_element_type);
6206 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
6207 mask = (int *) xmalloc (sizeof (int) * mask_nunits);
6208 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6209 scale = mask_nunits / nunits;
6210 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6211
6212 /* The number of vector stmts to generate, based only on the SLP_NODE_INSTANCE
6213 unrolling factor. */
6214 orig_vec_stmts_num = group_size *
6215 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
6216 if (orig_vec_stmts_num == 1)
6217 only_one_vec = true;
6218
6219 /* Number of copies is determined by the final vectorization factor
6220 relative to the SLP_NODE_INSTANCE unrolling factor. */
6221 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6222
6223 /* Generate permutation masks for every NODE. Number of masks for each NODE
6224 is equal to GROUP_SIZE.
6225 E.g., we have a group of three nodes with three loads from the same
6226 location in each node, and the vector size is 4. I.e., we have an
6227 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6228 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6229 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6230 ...
6231
6232 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
6233 specific type, e.g., in bytes for Altivec).
6234 The last mask is illegal since we assume two operands for permute
6235 operation, and the mask element values can't be outside that range. Hence,
6236 the last mask must be converted into {2,5,5,5}.
6237 For the first two permutations we need the first and the second input
6238 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6239 we need the second and the third vectors: {b1,c1,a2,b2} and
6240 {c2,a3,b3,c3}. */
6241
6242 for (i = 0;
6243 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
6244 i, node);
6245 i++)
6246 {
6247 scalar_index = 0;
6248 index = 0;
6249 vect_stmts_counter = 0;
6250 vec_index = 0;
6251 first_vec_index = vec_index++;
6252 if (only_one_vec)
6253 second_vec_index = first_vec_index;
6254 else
6255 second_vec_index = vec_index++;
6256
6257 for (j = 0; j < unroll_factor; j++)
6258 {
6259 for (k = 0; k < group_size; k++)
6260 {
6261 first_mask_element = (i + j * group_size) * scale;
6262 for (m = 0; m < scale; m++)
6263 {
6264 if (!vect_get_mask_element (stmt, first_mask_element, m,
6265 mask_nunits, only_one_vec, index, mask,
6266 &current_mask_element, &need_next_vector))
6267 return false;
6268
6269 mask[index++] = current_mask_element;
6270 }
6271
6272 if (index == mask_nunits)
6273 {
6274 index = 0;
6275 if (!analyze_only)
6276 {
6277 if (need_next_vector)
6278 {
6279 first_vec_index = second_vec_index;
6280 second_vec_index = vec_index;
6281 }
6282
6283 next_scalar_stmt = VEC_index (gimple,
6284 SLP_TREE_SCALAR_STMTS (node), scalar_index++);
6285
6286 vect_create_mask_and_perm (stmt, next_scalar_stmt,
6287 mask, mask_nunits, mask_element_type, mask_type,
6288 first_vec_index, second_vec_index, gsi, node,
6289 builtin_decl, vectype, dr_chain, ncopies,
6290 vect_stmts_counter++);
6291 }
6292 }
6293 }
6294 }
6295 }
6296
6297 free (mask);
6298 return true;
6299 }
6300
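/* Editorial sketch (not part of this file; kept under #if 0 so it does not
   take part in the build): the mask adjustment described in the comment
   above, in a simplified form.  Absolute element indices such as {6,9,9,9}
   are rebased onto the pair of input vectors that contains them, giving
   the two-operand mask {2,5,5,5} over the second and third vectors.  It
   assumes all indices fit in two consecutive vectors, which the real code
   checks incrementally in vect_get_mask_element.  */
#if 0
static void
rebase_perm_mask (const int *abs_idx, int n, int nunits,
                  int *first_vec, int *second_vec, int *mask)
{
  int i;
  int base = abs_idx[0] / nunits * nunits;  /* start of the first vector  */

  *first_vec = base / nunits;
  *second_vec = *first_vec + 1;
  for (i = 0; i < n; i++)
    mask[i] = abs_idx[i] - base;            /* now in [0, 2 * nunits)  */
}

/* For {6,9,9,9} and nunits == 4 this yields first_vec == 1,
   second_vec == 2 and mask == {2,5,5,5}.  */
#endif
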
6301 /* vectorizable_load.
6302
6303 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
6304 can be vectorized.
6305 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6306 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
6307 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6308
6309 bool
6310 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
6311 slp_tree slp_node, slp_instance slp_node_instance)
6312 {
6313 tree scalar_dest;
6314 tree vec_dest = NULL;
6315 tree data_ref = NULL;
6316 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6317 stmt_vec_info prev_stmt_info;
6318 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6319 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6320 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
6321 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
6322 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
6323 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6324 tree new_temp;
6325 int mode;
6326 gimple new_stmt = NULL;
6327 tree dummy;
6328 enum dr_alignment_support alignment_support_scheme;
6329 tree dataref_ptr = NULL_TREE;
6330 gimple ptr_incr;
6331 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6332 int ncopies;
6333 int i, j, group_size;
6334 tree msq = NULL_TREE, lsq;
6335 tree offset = NULL_TREE;
6336 tree realignment_token = NULL_TREE;
6337 gimple phi = NULL;
6338 VEC(tree,heap) *dr_chain = NULL;
6339 bool strided_load = false;
6340 gimple first_stmt;
6341 tree scalar_type;
6342 bool inv_p;
6343 bool compute_in_loop = false;
6344 struct loop *at_loop;
6345 int vec_num;
6346 bool slp = (slp_node != NULL);
6347 bool slp_perm = false;
6348 enum tree_code code;
6349
6350 /* Multiple types in SLP are handled by creating the appropriate number of
6351 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6352 case of SLP. */
6353 if (slp)
6354 ncopies = 1;
6355 else
6356 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6357
6358 gcc_assert (ncopies >= 1);
6359
6360 /* FORNOW. This restriction should be relaxed. */
6361 if (nested_in_vect_loop && ncopies > 1)
6362 {
6363 if (vect_print_dump_info (REPORT_DETAILS))
6364 fprintf (vect_dump, "multiple types in nested loop.");
6365 return false;
6366 }
6367
6368 if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
6369 slp_perm = true;
6370
6371 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6372 return false;
6373
6374 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6375 return false;
6376
6377 /* Is vectorizable load? */
6378 if (!is_gimple_assign (stmt))
6379 return false;
6380
6381 scalar_dest = gimple_assign_lhs (stmt);
6382 if (TREE_CODE (scalar_dest) != SSA_NAME)
6383 return false;
6384
6385 code = gimple_assign_rhs_code (stmt);
6386 if (code != ARRAY_REF
6387 && code != INDIRECT_REF
6388 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
6389 return false;
6390
6391 if (!STMT_VINFO_DATA_REF (stmt_info))
6392 return false;
6393
6394 scalar_type = TREE_TYPE (DR_REF (dr));
6395 mode = (int) TYPE_MODE (vectype);
6396
6397 /* FORNOW. In some cases can vectorize even if data-type not supported
6398 (e.g. - data copies). */
6399 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
6400 {
6401 if (vect_print_dump_info (REPORT_DETAILS))
6402 fprintf (vect_dump, "Aligned load, but unsupported type.");
6403 return false;
6404 }
6405
6406 /* The vector component type needs to be trivially convertible to the
6407 scalar lhs. This should always be the case. */
6408 if (!useless_type_conversion_p (TREE_TYPE (scalar_dest), TREE_TYPE (vectype)))
6409 {
6410 if (vect_print_dump_info (REPORT_DETAILS))
6411 fprintf (vect_dump, "??? operands of different types");
6412 return false;
6413 }
6414
6415 /* Check if the load is a part of an interleaving chain. */
6416 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6417 {
6418 strided_load = true;
6419 /* FORNOW */
6420 gcc_assert (! nested_in_vect_loop);
6421
6422 /* Check if interleaving is supported. */
6423 if (!vect_strided_load_supported (vectype)
6424 && !PURE_SLP_STMT (stmt_info) && !slp)
6425 return false;
6426 }
6427
6428 if (!vec_stmt) /* transformation not required. */
6429 {
6430 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
6431 vect_model_load_cost (stmt_info, ncopies, NULL);
6432 return true;
6433 }
6434
6435 if (vect_print_dump_info (REPORT_DETAILS))
6436 fprintf (vect_dump, "transform load.");
6437
6438 /** Transform. **/
6439
6440 if (strided_load)
6441 {
6442 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
6443 /* Check if the chain of loads is already vectorized. */
6444 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
6445 {
6446 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6447 return true;
6448 }
6449 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
6450 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
6451
6452 /* VEC_NUM is the number of vect stmts to be created for this group. */
6453 if (slp)
6454 {
6455 strided_load = false;
6456 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6457 }
6458 else
6459 vec_num = group_size;
6460
6461 dr_chain = VEC_alloc (tree, heap, vec_num);
6462 }
6463 else
6464 {
6465 first_stmt = stmt;
6466 first_dr = dr;
6467 group_size = vec_num = 1;
6468 }
6469
6470 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
6471 gcc_assert (alignment_support_scheme);
6472
6473 /* In case the vectorization factor (VF) is bigger than the number
6474 of elements that we can fit in a vectype (nunits), we have to generate
6475 more than one vector stmt - i.e - we need to "unroll" the
6476 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6477 from one copy of the vector stmt to the next, in the field
6478 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6479 stages to find the correct vector defs to be used when vectorizing
6480 stmts that use the defs of the current stmt. The example below illustrates
6481 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
6482 4 vectorized stmts):
6483
6484 before vectorization:
6485 RELATED_STMT VEC_STMT
6486 S1: x = memref - -
6487 S2: z = x + 1 - -
6488
6489 step 1: vectorize stmt S1:
6490 We first create the vector stmt VS1_0, and, as usual, record a
6491 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
6492 Next, we create the vector stmt VS1_1, and record a pointer to
6493 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
6494 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
6495 stmts and pointers:
6496 RELATED_STMT VEC_STMT
6497 VS1_0: vx0 = memref0 VS1_1 -
6498 VS1_1: vx1 = memref1 VS1_2 -
6499 VS1_2: vx2 = memref2 VS1_3 -
6500 VS1_3: vx3 = memref3 - -
6501 S1: x = load - VS1_0
6502 S2: z = x + 1 - -
6503
6504 See in documentation in vect_get_vec_def_for_stmt_copy for how the
6505 information we recorded in RELATED_STMT field is used to vectorize
6506 stmt S2. */
6507
6508 /* In case of interleaving (non-unit strided access):
6509
6510 S1: x2 = &base + 2
6511 S2: x0 = &base
6512 S3: x1 = &base + 1
6513 S4: x3 = &base + 3
6514
6515 Vectorized loads are created in the order of memory accesses
6516 starting from the access of the first stmt of the chain:
6517
6518 VS1: vx0 = &base
6519 VS2: vx1 = &base + vec_size*1
6520 VS3: vx3 = &base + vec_size*2
6521 VS4: vx4 = &base + vec_size*3
6522
6523 Then permutation statements are generated:
6524
6525 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
6526 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
6527 ...
6528
6529 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
6530 (the order of the data-refs in the output of vect_permute_load_chain
6531 corresponds to the order of scalar stmts in the interleaving chain - see
6532 the documentation of vect_permute_load_chain()).
6533 The generation of permutation stmts and recording them in
6534 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
6535
6536 In case of both multiple types and interleaving, the vector loads and
6537 permutation stmts above are created for every copy. The result vector stmts
6538 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
6539 STMT_VINFO_RELATED_STMT for the next copies. */
6540
6541 /* If the data reference is aligned (dr_aligned) or potentially unaligned
6542 on a target that supports unaligned accesses (dr_unaligned_supported)
6543 we generate the following code:
6544 p = initial_addr;
6545 indx = 0;
6546 loop {
6547 p = p + indx * vectype_size;
6548 vec_dest = *(p);
6549 indx = indx + 1;
6550 }
6551
6552 Otherwise, the data reference is potentially unaligned on a target that
6553 does not support unaligned accesses (dr_explicit_realign_optimized) -
6554 then generate the following code, in which the data in each iteration is
6555 obtained by two vector loads, one from the previous iteration, and one
6556 from the current iteration:
6557 p1 = initial_addr;
6558 msq_init = *(floor(p1))
6559 p2 = initial_addr + VS - 1;
6560 realignment_token = call target_builtin;
6561 indx = 0;
6562 loop {
6563 p2 = p2 + indx * vectype_size
6564 lsq = *(floor(p2))
6565 vec_dest = realign_load (msq, lsq, realignment_token)
6566 indx = indx + 1;
6567 msq = lsq;
6568 } */
6569
6570 /* If the misalignment remains the same throughout the execution of the
6571 loop, we can create the init_addr and permutation mask at the loop
6572 preheader. Otherwise, it needs to be created inside the loop.
6573 This can only occur when vectorizing memory accesses in the inner-loop
6574 nested within an outer-loop that is being vectorized. */
6575
6576 if (nested_in_vect_loop_p (loop, stmt)
6577 && (TREE_INT_CST_LOW (DR_STEP (dr))
6578 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
6579 {
6580 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
6581 compute_in_loop = true;
6582 }
6583
6584 if ((alignment_support_scheme == dr_explicit_realign_optimized
6585 || alignment_support_scheme == dr_explicit_realign)
6586 && !compute_in_loop)
6587 {
6588 msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
6589 alignment_support_scheme, NULL_TREE,
6590 &at_loop);
6591 if (alignment_support_scheme == dr_explicit_realign_optimized)
6592 {
6593 phi = SSA_NAME_DEF_STMT (msq);
6594 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6595 }
6596 }
6597 else
6598 at_loop = loop;
6599
6600 prev_stmt_info = NULL;
6601 for (j = 0; j < ncopies; j++)
6602 {
6603 /* 1. Create the vector pointer update chain. */
6604 if (j == 0)
6605 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
6606 at_loop, offset,
6607 &dummy, &ptr_incr, false,
6608 &inv_p, NULL_TREE);
6609 else
6610 dataref_ptr =
6611 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
6612
6613 for (i = 0; i < vec_num; i++)
6614 {
6615 if (i > 0)
6616 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
6617 NULL_TREE);
6618
6619 /* 2. Create the vector-load in the loop. */
6620 switch (alignment_support_scheme)
6621 {
6622 case dr_aligned:
6623 gcc_assert (aligned_access_p (first_dr));
6624 data_ref = build_fold_indirect_ref (dataref_ptr);
6625 break;
6626 case dr_unaligned_supported:
6627 {
6628 int mis = DR_MISALIGNMENT (first_dr);
6629 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
6630
6631 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
6632 data_ref =
6633 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
6634 break;
6635 }
6636 case dr_explicit_realign:
6637 {
6638 tree ptr, bump;
6639 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6640
6641 if (compute_in_loop)
6642 msq = vect_setup_realignment (first_stmt, gsi,
6643 &realignment_token,
6644 dr_explicit_realign,
6645 dataref_ptr, NULL);
6646
6647 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6648 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6649 new_stmt = gimple_build_assign (vec_dest, data_ref);
6650 new_temp = make_ssa_name (vec_dest, new_stmt);
6651 gimple_assign_set_lhs (new_stmt, new_temp);
6652 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6653 copy_virtual_operands (new_stmt, stmt);
6654 mark_symbols_for_renaming (new_stmt);
6655 msq = new_temp;
6656
6657 bump = size_binop (MULT_EXPR, vs_minus_1,
6658 TYPE_SIZE_UNIT (scalar_type));
6659 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
6660 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
6661 break;
6662 }
6663 case dr_explicit_realign_optimized:
6664 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6665 break;
6666 default:
6667 gcc_unreachable ();
6668 }
6669 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6670 new_stmt = gimple_build_assign (vec_dest, data_ref);
6671 new_temp = make_ssa_name (vec_dest, new_stmt);
6672 gimple_assign_set_lhs (new_stmt, new_temp);
6673 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6674 mark_symbols_for_renaming (new_stmt);
6675
6676 /* 3. Handle explicit realignment if necessary/supported. Create in
6677 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
6678 if (alignment_support_scheme == dr_explicit_realign_optimized
6679 || alignment_support_scheme == dr_explicit_realign)
6680 {
6681 tree tmp;
6682
6683 lsq = gimple_assign_lhs (new_stmt);
6684 if (!realignment_token)
6685 realignment_token = dataref_ptr;
6686 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6687 tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
6688 realignment_token);
6689 new_stmt = gimple_build_assign (vec_dest, tmp);
6690 new_temp = make_ssa_name (vec_dest, new_stmt);
6691 gimple_assign_set_lhs (new_stmt, new_temp);
6692 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6693
6694 if (alignment_support_scheme == dr_explicit_realign_optimized)
6695 {
6696 gcc_assert (phi);
6697 if (i == vec_num - 1 && j == ncopies - 1)
6698 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
6699 msq = lsq;
6700 }
6701 }
6702
6703 /* 4. Handle invariant-load. */
6704 if (inv_p)
6705 {
6706 gcc_assert (!strided_load);
6707 gcc_assert (nested_in_vect_loop_p (loop, stmt));
6708 if (j == 0)
6709 {
6710 int k;
6711 tree t = NULL_TREE;
6712 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
6713
6714 /* CHECKME: bitpos depends on endianness? */
6715 bitpos = bitsize_zero_node;
6716 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6717 bitsize, bitpos);
6718 vec_dest =
6719 vect_create_destination_var (scalar_dest, NULL_TREE);
6720 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6721 new_temp = make_ssa_name (vec_dest, new_stmt);
6722 gimple_assign_set_lhs (new_stmt, new_temp);
6723 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6724
6725 for (k = nunits - 1; k >= 0; --k)
6726 t = tree_cons (NULL_TREE, new_temp, t);
6727 /* FIXME: use build_constructor directly. */
6728 vec_inv = build_constructor_from_list (vectype, t);
6729 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
6730 new_stmt = SSA_NAME_DEF_STMT (new_temp);
6731 }
6732 else
6733 gcc_unreachable (); /* FORNOW. */
6734 }
6735
6736 /* Collect vector loads and later create their permutation in
6737 vect_transform_strided_load (). */
6738 if (strided_load || slp_perm)
6739 VEC_quick_push (tree, dr_chain, new_temp);
6740
6741 /* Store vector loads in the corresponding SLP_NODE. */
6742 if (slp && !slp_perm)
6743 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
6744 }
6745
6746 if (slp && !slp_perm)
6747 continue;
6748
6749 if (slp_perm)
6750 {
6751 if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
6752 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
6753 slp_node_instance, false))
6754 {
6755 VEC_free (tree, heap, dr_chain);
6756 return false;
6757 }
6758 }
6759 else
6760 {
6761 if (strided_load)
6762 {
6763 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
6764 return false;
6765
6766 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6767 VEC_free (tree, heap, dr_chain);
6768 dr_chain = VEC_alloc (tree, heap, group_size);
6769 }
6770 else
6771 {
6772 if (j == 0)
6773 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6774 else
6775 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6776 prev_stmt_info = vinfo_for_stmt (new_stmt);
6777 }
6778 }
6779 }
6780
6781 if (dr_chain)
6782 VEC_free (tree, heap, dr_chain);
6783
6784 return true;
6785 }
6786
6787
6788 /* Function vectorizable_live_operation.
6789
6790 STMT computes a value that is used outside the loop. Check if
6791 it can be supported. */
6792
6793 bool
6794 vectorizable_live_operation (gimple stmt,
6795 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6796 gimple *vec_stmt ATTRIBUTE_UNUSED)
6797 {
6798 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6799 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6800 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6801 int i;
6802 int op_type;
6803 tree op;
6804 tree def;
6805 gimple def_stmt;
6806 enum vect_def_type dt;
6807 enum tree_code code;
6808 enum gimple_rhs_class rhs_class;
6809
6810 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6811
6812 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6813 return false;
6814
6815 if (!is_gimple_assign (stmt))
6816 return false;
6817
6818 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6819 return false;
6820
6821 /* FORNOW. CHECKME. */
6822 if (nested_in_vect_loop_p (loop, stmt))
6823 return false;
6824
6825 code = gimple_assign_rhs_code (stmt);
6826 op_type = TREE_CODE_LENGTH (code);
6827 rhs_class = get_gimple_rhs_class (code);
6828 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6829 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6830
6831 /* FORNOW: support only if all uses are invariant. This means
6832 that the scalar operations can remain in place, unvectorized.
6833 The original last scalar value that they compute will be used. */
6834
6835 for (i = 0; i < op_type; i++)
6836 {
6837 if (rhs_class == GIMPLE_SINGLE_RHS)
6838 op = TREE_OPERAND (gimple_op (stmt, 1), i);
6839 else
6840 op = gimple_op (stmt, i + 1);
6841 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6842 {
6843 if (vect_print_dump_info (REPORT_DETAILS))
6844 fprintf (vect_dump, "use not simple.");
6845 return false;
6846 }
6847
6848 if (dt != vect_invariant_def && dt != vect_constant_def)
6849 return false;
6850 }
6851
6852 /* No transformation is required for the cases we currently support. */
6853 return true;
6854 }
6855
6856
6857 /* Function vect_is_simple_cond.
6858
6859 Input:
6860 LOOP - the loop that is being vectorized.
6861 COND - Condition that is checked for simple use.
6862
6863 Returns whether a COND can be vectorized. Checks whether
6864 condition operands are supportable using vect_is_simple_use. */
6865
6866 static bool
6867 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6868 {
6869 tree lhs, rhs;
6870 tree def;
6871 enum vect_def_type dt;
6872
6873 if (!COMPARISON_CLASS_P (cond))
6874 return false;
6875
6876 lhs = TREE_OPERAND (cond, 0);
6877 rhs = TREE_OPERAND (cond, 1);
6878
6879 if (TREE_CODE (lhs) == SSA_NAME)
6880 {
6881 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6882 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6883 return false;
6884 }
6885 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6886 && TREE_CODE (lhs) != FIXED_CST)
6887 return false;
6888
6889 if (TREE_CODE (rhs) == SSA_NAME)
6890 {
6891 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6892 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6893 return false;
6894 }
6895 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6896 && TREE_CODE (rhs) != FIXED_CST)
6897 return false;
6898
6899 return true;
6900 }
6901
6902 /* vectorizable_condition.
6903
6904 Check if STMT is a conditional modify expression that can be vectorized.
6905 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6906 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6907 at GSI.
6908
6909 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6910
6911 bool
6912 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
6913 gimple *vec_stmt)
6914 {
6915 tree scalar_dest = NULL_TREE;
6916 tree vec_dest = NULL_TREE;
6917 tree op = NULL_TREE;
6918 tree cond_expr, then_clause, else_clause;
6919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6920 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6921 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6922 tree vec_compare, vec_cond_expr;
6923 tree new_temp;
6924 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6925 enum machine_mode vec_mode;
6926 tree def;
6927 enum vect_def_type dt;
6928 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6929 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6930 enum tree_code code;
6931
6932 gcc_assert (ncopies >= 1);
6933 if (ncopies > 1)
6934 return false; /* FORNOW */
6935
6936 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6937 return false;
6938
6939 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6940 return false;
6941
6942 /* FORNOW: SLP not supported. */
6943 if (STMT_SLP_TYPE (stmt_info))
6944 return false;
6945
6946 /* FORNOW: not yet supported. */
6947 if (STMT_VINFO_LIVE_P (stmt_info))
6948 {
6949 if (vect_print_dump_info (REPORT_DETAILS))
6950 fprintf (vect_dump, "value used after loop.");
6951 return false;
6952 }
6953
6954 /* Is vectorizable conditional operation? */
6955 if (!is_gimple_assign (stmt))
6956 return false;
6957
6958 code = gimple_assign_rhs_code (stmt);
6959
6960 if (code != COND_EXPR)
6961 return false;
6962
6963 gcc_assert (gimple_assign_single_p (stmt));
6964 op = gimple_assign_rhs1 (stmt);
6965 cond_expr = TREE_OPERAND (op, 0);
6966 then_clause = TREE_OPERAND (op, 1);
6967 else_clause = TREE_OPERAND (op, 2);
6968
6969 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6970 return false;
6971
6972 /* We do not handle two different vector types for the condition
6973 and the values. */
6974 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6975 return false;
6976
6977 if (TREE_CODE (then_clause) == SSA_NAME)
6978 {
6979 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6980 if (!vect_is_simple_use (then_clause, loop_vinfo,
6981 &then_def_stmt, &def, &dt))
6982 return false;
6983 }
6984 else if (TREE_CODE (then_clause) != INTEGER_CST
6985 && TREE_CODE (then_clause) != REAL_CST
6986 && TREE_CODE (then_clause) != FIXED_CST)
6987 return false;
6988
6989 if (TREE_CODE (else_clause) == SSA_NAME)
6990 {
6991 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6992 if (!vect_is_simple_use (else_clause, loop_vinfo,
6993 &else_def_stmt, &def, &dt))
6994 return false;
6995 }
6996 else if (TREE_CODE (else_clause) != INTEGER_CST
6997 && TREE_CODE (else_clause) != REAL_CST
6998 && TREE_CODE (else_clause) != FIXED_CST)
6999 return false;
7000
7001
7002 vec_mode = TYPE_MODE (vectype);
7003
7004 if (!vec_stmt)
7005 {
7006 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
7007 return expand_vec_cond_expr_p (op, vec_mode);
7008 }
7009
7010 /* Transform */
7011
7012 /* Handle def. */
7013 scalar_dest = gimple_assign_lhs (stmt);
7014 vec_dest = vect_create_destination_var (scalar_dest, vectype);
7015
7016 /* Handle cond expr. */
7017 vec_cond_lhs =
7018 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
7019 vec_cond_rhs =
7020 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
7021 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
7022 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
7023
7024 /* Arguments are ready. Create the new vector stmt. */
7025 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
7026 vec_cond_lhs, vec_cond_rhs);
7027 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
7028 vec_compare, vec_then_clause, vec_else_clause);
7029
7030 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
7031 new_temp = make_ssa_name (vec_dest, *vec_stmt);
7032 gimple_assign_set_lhs (*vec_stmt, new_temp);
7033 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
7034
7035 return true;
7036 }
7037
7038
7039 /* Function vect_transform_stmt.
7040
7041 Create a vectorized stmt to replace STMT, and insert it at GSI. */
7042
7043 static bool
7044 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
7045 bool *strided_store, slp_tree slp_node,
7046 slp_instance slp_node_instance)
7047 {
7048 bool is_store = false;
7049 gimple vec_stmt = NULL;
7050 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7051 gimple orig_stmt_in_pattern;
7052 bool done;
7053 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7054 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7055
7056 switch (STMT_VINFO_TYPE (stmt_info))
7057 {
7058 case type_demotion_vec_info_type:
7059 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node);
7060 gcc_assert (done);
7061 break;
7062
7063 case type_promotion_vec_info_type:
7064 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node);
7065 gcc_assert (done);
7066 break;
7067
7068 case type_conversion_vec_info_type:
7069 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
7070 gcc_assert (done);
7071 break;
7072
7073 case induc_vec_info_type:
7074 gcc_assert (!slp_node);
7075 done = vectorizable_induction (stmt, gsi, &vec_stmt);
7076 gcc_assert (done);
7077 break;
7078
7079 case op_vec_info_type:
7080 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
7081 gcc_assert (done);
7082 break;
7083
7084 case assignment_vec_info_type:
7085 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
7086 gcc_assert (done);
7087 break;
7088
7089 case load_vec_info_type:
7090 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
7091 slp_node_instance);
7092 gcc_assert (done);
7093 break;
7094
7095 case store_vec_info_type:
7096 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
7097 gcc_assert (done);
7098 if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node)
7099 {
7100 /* In case of interleaving, the whole chain is vectorized when the
7101 last store in the chain is reached. Store stmts before the last
7102 one are skipped, and their vec_stmt_info shouldn't be freed
7103 meanwhile. */
7104 *strided_store = true;
7105 if (STMT_VINFO_VEC_STMT (stmt_info))
7106 is_store = true;
7107 }
7108 else
7109 is_store = true;
7110 break;
7111
7112 case condition_vec_info_type:
7113 gcc_assert (!slp_node);
7114 done = vectorizable_condition (stmt, gsi, &vec_stmt);
7115 gcc_assert (done);
7116 break;
7117
7118 case call_vec_info_type:
7119 gcc_assert (!slp_node);
7120 done = vectorizable_call (stmt, gsi, &vec_stmt);
7121 break;
7122
7123 case reduc_vec_info_type:
7124 gcc_assert (!slp_node);
7125 done = vectorizable_reduction (stmt, gsi, &vec_stmt);
7126 gcc_assert (done);
7127 break;
7128
7129 default:
7130 if (!STMT_VINFO_LIVE_P (stmt_info))
7131 {
7132 if (vect_print_dump_info (REPORT_DETAILS))
7133 fprintf (vect_dump, "stmt not supported.");
7134 gcc_unreachable ();
7135 }
7136 }
7137
7138 /* Handle inner-loop stmts whose DEF is used in the loop-nest that
7139 is being vectorized, but outside the immediately enclosing loop. */
7140 if (vec_stmt
7141 && nested_in_vect_loop_p (loop, stmt)
7142 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type
7143 && (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer
7144 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer_by_reduction))
7145 {
7146 struct loop *innerloop = loop->inner;
7147 imm_use_iterator imm_iter;
7148 use_operand_p use_p;
7149 tree scalar_dest;
7150 gimple exit_phi;
7151
7152 if (vect_print_dump_info (REPORT_DETAILS))
7153 fprintf (vect_dump, "Record the vdef for outer-loop vectorization.");
7154
7155 /* Find the relevant loop-exit phi-node, and record the vec_stmt there
7156 (to be used when vectorizing outer-loop stmts that use the DEF of
7157 STMT). */
7158 if (gimple_code (stmt) == GIMPLE_PHI)
7159 scalar_dest = PHI_RESULT (stmt);
7160 else
7161 scalar_dest = gimple_assign_lhs (stmt);
7162
7163 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
7164 {
7165 if (!flow_bb_inside_loop_p (innerloop, gimple_bb (USE_STMT (use_p))))
7166 {
7167 exit_phi = USE_STMT (use_p);
7168 STMT_VINFO_VEC_STMT (vinfo_for_stmt (exit_phi)) = vec_stmt;
7169 }
7170 }
7171 }
7172
7173 /* Handle stmts whose DEF is used outside the loop-nest that is
7174 being vectorized. */
7175 if (STMT_VINFO_LIVE_P (stmt_info)
7176 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
7177 {
7178 done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
7179 gcc_assert (done);
7180 }
7181
7182 if (vec_stmt)
7183 {
7184 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
7185 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
7186 if (orig_stmt_in_pattern)
7187 {
7188 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
7189 /* STMT was inserted by the vectorizer to replace a computation idiom.
7190 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
7191 computed this idiom. We need to record a pointer to VEC_STMT in
7192 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
7193 documentation of vect_pattern_recog. */
7194 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
7195 {
7196 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
7197 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
7198 }
7199 }
7200 }
7201
7202 return is_store;
7203 }
7204
7205
7206 /* This function builds ni_name = the number of iterations the loop
7207 executes, and inserts the computation on the loop preheader. */
7208
7209 static tree
7210 vect_build_loop_niters (loop_vec_info loop_vinfo)
7211 {
7212 tree ni_name, var;
7213 gimple_seq stmts = NULL;
7214 edge pe;
7215 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7216 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
7217
7218 var = create_tmp_var (TREE_TYPE (ni), "niters");
7219 add_referenced_var (var);
7220 ni_name = force_gimple_operand (ni, &stmts, false, var);
7221
7222 pe = loop_preheader_edge (loop);
7223 if (stmts)
7224 {
7225 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7226 gcc_assert (!new_bb);
7227 }
7228
7229 return ni_name;
7230 }
7231
7232
7233 /* This function generates the following statements:
7234
7235 ni_name = number of iterations loop executes
7236 ratio = ni_name / vf
7237 ratio_mult_vf_name = ratio * vf
7238
7239 and places them at the loop preheader edge. */
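
/* Illustrative sketch of the computation performed below, assuming (as the
   code does) that VF is a power of two:

     log_vf        = exact_log2 (vf);      e.g. vf = 4  ->  log_vf = 2
     ratio         = ni >> log_vf;         ni / vf, rounded down
     ratio_mult_vf = ratio << log_vf;      iterations of the vectorized loop

   E.g. ni = 103, vf = 4: ratio = 25, ratio_mult_vf = 100; the remaining
   3 iterations are left for the epilog loop.  */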
7240
7241 static void
7242 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
7243 tree *ni_name_ptr,
7244 tree *ratio_mult_vf_name_ptr,
7245 tree *ratio_name_ptr)
7246 {
7247
7248 edge pe;
7249 basic_block new_bb;
7250 gimple_seq stmts;
7251 tree ni_name;
7252 tree var;
7253 tree ratio_name;
7254 tree ratio_mult_vf_name;
7255 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7256 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
7257 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7258 tree log_vf;
7259
7260 pe = loop_preheader_edge (loop);
7261
7262 /* Generate temporary variable that contains
7263 number of iterations loop executes. */
7264
7265 ni_name = vect_build_loop_niters (loop_vinfo);
7266 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
7267
7268 /* Create: ratio = ni >> log2(vf) */
7269
7270 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
7271 if (!is_gimple_val (ratio_name))
7272 {
7273 var = create_tmp_var (TREE_TYPE (ni), "bnd");
7274 add_referenced_var (var);
7275
7276 stmts = NULL;
7277 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
7278 pe = loop_preheader_edge (loop);
7279 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7280 gcc_assert (!new_bb);
7281 }
7282
7283 /* Create: ratio_mult_vf = ratio << log2 (vf). */
7284
7285 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
7286 ratio_name, log_vf);
7287 if (!is_gimple_val (ratio_mult_vf_name))
7288 {
7289 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
7290 add_referenced_var (var);
7291
7292 stmts = NULL;
7293 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
7294 true, var);
7295 pe = loop_preheader_edge (loop);
7296 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7297 gcc_assert (!new_bb);
7298 }
7299
7300 *ni_name_ptr = ni_name;
7301 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
7302 *ratio_name_ptr = ratio_name;
7303
7304 return;
7305 }
7306
7307
7308 /* Function vect_update_ivs_after_vectorizer.
7309
7310 "Advance" the induction variables of LOOP to the value they should take
7311 after the execution of LOOP. This is currently necessary because the
7312 vectorizer does not handle induction variables that are used after the
7313 loop. Such a situation occurs when the last iterations of LOOP are
7314 peeled, because:
7315 1. We introduced new uses after LOOP for IVs that were not originally used
7316 after LOOP: the IVs of LOOP are now used by an epilog loop.
7317 2. LOOP is going to be vectorized; this means that it will iterate N/VF
7318 times, whereas the loop IVs should be bumped N times.
7319
7320 Input:
7321 - LOOP - a loop that is going to be vectorized. The last few iterations
7322 of LOOP were peeled.
7323 - NITERS - the number of iterations that LOOP executes (before it is
7324 vectorized), i.e., the number of times the ivs should be bumped.
7325 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
7326 coming out from LOOP on which there are uses of the LOOP ivs
7327 (this is the path from LOOP->exit to epilog_loop->preheader).
7328
7329 The new definitions of the ivs are placed in LOOP->exit.
7330 The phi args associated with the edge UPDATE_E in the bb
7331 UPDATE_E->dest are updated accordingly.
7332
7333 Assumption 1: Like the rest of the vectorizer, this function assumes
7334 a single loop exit that has a single predecessor.
7335
7336 Assumption 2: The phi nodes in the LOOP header and in update_bb are
7337 organized in the same order.
7338
7339 Assumption 3: The access function of the ivs is simple enough (see
7340 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
7341
7342 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
7343 coming out of LOOP on which the ivs of LOOP are used (this is the path
7344 that leads to the epilog loop; other paths skip the epilog loop). This
7345 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
7346 needs to have its phis updated.
7347 */
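
/* Illustrative example only: for a simple IV with initial value INIT and
   step STEP, the value inserted at the loop exit and fed to the phi in
   UPDATE_E->dest is

     ni = INIT + NITERS * STEP;

   e.g. INIT = 0, STEP = 1, NITERS = 100 gives ni = 100, so the epilog
   loop resumes the IV where the scalar loop would have left it.  Pointer
   IVs use POINTER_PLUS_EXPR with the product converted to sizetype, as in
   the code below.  */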
7348
7349 static void
7350 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
7351 edge update_e)
7352 {
7353 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7354 basic_block exit_bb = single_exit (loop)->dest;
7355 gimple phi, phi1;
7356 gimple_stmt_iterator gsi, gsi1;
7357 basic_block update_bb = update_e->dest;
7358
7359 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
7360
7361 /* Make sure there exists a single-predecessor exit bb: */
7362 gcc_assert (single_pred_p (exit_bb));
7363
7364 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
7365 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
7366 gsi_next (&gsi), gsi_next (&gsi1))
7367 {
7368 tree access_fn = NULL;
7369 tree evolution_part;
7370 tree init_expr;
7371 tree step_expr;
7372 tree var, ni, ni_name;
7373 gimple_stmt_iterator last_gsi;
7374
7375 phi = gsi_stmt (gsi);
7376 phi1 = gsi_stmt (gsi1);
7377 if (vect_print_dump_info (REPORT_DETAILS))
7378 {
7379 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
7380 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
7381 }
7382
7383 /* Skip virtual phi's. */
7384 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
7385 {
7386 if (vect_print_dump_info (REPORT_DETAILS))
7387 fprintf (vect_dump, "virtual phi. skip.");
7388 continue;
7389 }
7390
7391 /* Skip reduction phis. */
7392 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
7393 {
7394 if (vect_print_dump_info (REPORT_DETAILS))
7395 fprintf (vect_dump, "reduc phi. skip.");
7396 continue;
7397 }
7398
7399 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
7400 gcc_assert (access_fn);
7401 STRIP_NOPS (access_fn);
7402 evolution_part =
7403 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
7404 gcc_assert (evolution_part != NULL_TREE);
7405
7406 /* FORNOW: We do not support IVs whose evolution function is a polynomial
7407 of degree >= 2 or exponential. */
7408 gcc_assert (!tree_is_chrec (evolution_part));
7409
7410 step_expr = evolution_part;
7411 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
7412 loop->num));
7413
7414 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
7415 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
7416 init_expr,
7417 fold_convert (sizetype,
7418 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
7419 niters, step_expr)));
7420 else
7421 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
7422 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
7423 fold_convert (TREE_TYPE (init_expr),
7424 niters),
7425 step_expr),
7426 init_expr);
7427
7428
7429
7430 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
7431 add_referenced_var (var);
7432
7433 last_gsi = gsi_last_bb (exit_bb);
7434 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
7435 true, GSI_SAME_STMT);
7436
7437 /* Fix phi expressions in the successor bb. */
7438 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
7439 }
7440 }
7441
7442 /* Return the more conservative threshold between the
7443 min_profitable_iters returned by the cost model and the user-specified
7444 threshold, if provided. */
7445
7446 static unsigned int
7447 conservative_cost_threshold (loop_vec_info loop_vinfo,
7448 int min_profitable_iters)
7449 {
7450 unsigned int th;
7451 int min_scalar_loop_bound;
7452
7453 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
7454 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
7455
7456 /* Use the cost model only if it is more conservative than user specified
7457 threshold. */
7458 th = (unsigned) min_scalar_loop_bound;
7459 if (min_profitable_iters
7460 && (!min_scalar_loop_bound
7461 || min_profitable_iters > min_scalar_loop_bound))
7462 th = (unsigned) min_profitable_iters;
7463
7464 if (th && vect_print_dump_info (REPORT_COST))
7465 fprintf (vect_dump, "Vectorization may not be profitable.");
7466
7467 return th;
7468 }
7469
7470 /* Function vect_do_peeling_for_loop_bound
7471
7472 Peel the last iterations of the loop represented by LOOP_VINFO.
7473 The peeled iterations form a new epilog loop. Given that the loop now
7474 iterates NITERS times, the new epilog loop iterates
7475 NITERS % VECTORIZATION_FACTOR times.
7476
7477 The original loop will later be made to iterate
7478 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
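
/* A rough sketch of the resulting loop structure (illustrative only),
   e.g. for NITERS = 103 and VF = 4:

     for (i = 0; i < 100; i += 4)     (vectorized loop, RATIO = 25 iterations)
       ...
     for ( ; i < 103; i++)            (peeled epilog loop, 103 % 4 = 3 iterations)
       ...

   When enabled below, the profitability check is folded into the guard
   that decides whether the vectorized loop is entered at all.  */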
7479
7480 static void
7481 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
7482 {
7483 tree ni_name, ratio_mult_vf_name;
7484 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7485 struct loop *new_loop;
7486 edge update_e;
7487 basic_block preheader;
7488 int loop_num;
7489 bool check_profitability = false;
7490 unsigned int th = 0;
7491 int min_profitable_iters;
7492
7493 if (vect_print_dump_info (REPORT_DETAILS))
7494 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
7495
7496 initialize_original_copy_tables ();
7497
7498 /* Generate the following variables on the preheader of the original loop:
7499
7500 ni_name = number of iterations the original loop executes
7501 ratio = ni_name / vf
7502 ratio_mult_vf_name = ratio * vf */
7503 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
7504 &ratio_mult_vf_name, ratio);
7505
7506 loop_num = loop->num;
7507
7508 /* If the cost model check was not done during versioning or during
7509 peeling for alignment, do it now. */
7510 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7511 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
7512 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7513 {
7514 check_profitability = true;
7515
7516 /* Get profitability threshold for vectorized loop. */
7517 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7518
7519 th = conservative_cost_threshold (loop_vinfo,
7520 min_profitable_iters);
7521 }
7522
7523 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
7524 ratio_mult_vf_name, ni_name, false,
7525 th, check_profitability);
7526 gcc_assert (new_loop);
7527 gcc_assert (loop_num == loop->num);
7528 #ifdef ENABLE_CHECKING
7529 slpeel_verify_cfg_after_peeling (loop, new_loop);
7530 #endif
7531
7532 /* A guard that controls whether the new_loop is to be executed or skipped
7533 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
7534 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
7535 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
7536 is on the path where the LOOP IVs are used and need to be updated. */
7537
7538 preheader = loop_preheader_edge (new_loop)->src;
7539 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
7540 update_e = EDGE_PRED (preheader, 0);
7541 else
7542 update_e = EDGE_PRED (preheader, 1);
7543
7544 /* Update IVs of original loop as if they were advanced
7545 by ratio_mult_vf_name steps. */
7546 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
7547
7548 /* After peeling we have to reset scalar evolution analyzer. */
7549 scev_reset ();
7550
7551 free_original_copy_tables ();
7552 }
7553
7554
7555 /* Function vect_gen_niters_for_prolog_loop
7556
7557 Set the number of iterations for the loop represented by LOOP_VINFO
7558 to the minimum between LOOP_NITERS (the original iteration count of the loop)
7559 and the misalignment of DR - the data reference recorded in
7560 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
7561 this loop, the data reference DR will refer to an aligned location.
7562
7563 The following computation is generated:
7564
7565 If the misalignment of DR is known at compile time:
7566 addr_mis = DR_MISALIGNMENT (dr);
7567 Else, compute address misalignment in bytes:
7568 addr_mis = addr & (vectype_size - 1)
7569
7570 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
7571
7572 (elem_size = element type size; an element is the scalar element whose type
7573 is the inner type of the vectype)
7574
7575 When the step of the data-ref in the loop is not 1 (as in interleaved data
7576 and SLP), the number of iterations of the prolog must be divided by the step
7577 (which is equal to the size of the interleaved group).
7578
7579 The above formulas assume that VF == number of elements in the vector. This
7580 may not hold when there are multiple types in the loop.
7581 In this case, for some data-references in the loop the VF does not represent
7582 the number of elements that fit in the vector. Therefore, instead of VF we
7583 use TYPE_VECTOR_SUBPARTS. */
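
/* Worked example (illustrative only): nelements = VF = 4, element size =
   4 bytes, DR misaligned by 8 bytes, step = 1:

     elem_misalign = 8 / 4                     = 2
     prolog_niters = ((4 - 2) & (4 - 1)) / 1   = 2

   i.e. after two peeled scalar iterations the access has advanced by
   8 bytes and reaches a 16-byte aligned address for the vectorized loop.  */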
7584
7585 static tree
7586 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
7587 {
7588 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
7589 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7590 tree var;
7591 gimple_seq stmts;
7592 tree iters, iters_name;
7593 edge pe;
7594 basic_block new_bb;
7595 gimple dr_stmt = DR_STMT (dr);
7596 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
7597 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7598 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
7599 tree niters_type = TREE_TYPE (loop_niters);
7600 int step = 1;
7601 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
7602 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
7603
7604 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7605 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
7606
7607 pe = loop_preheader_edge (loop);
7608
7609 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
7610 {
7611 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
7612 int elem_misalign = byte_misalign / element_size;
7613
7614 if (vect_print_dump_info (REPORT_DETAILS))
7615 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
7616
7617 iters = build_int_cst (niters_type,
7618 (((nelements - elem_misalign) & (nelements - 1)) / step));
7619 }
7620 else
7621 {
7622 gimple_seq new_stmts = NULL;
7623 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
7624 &new_stmts, NULL_TREE, loop);
7625 tree ptr_type = TREE_TYPE (start_addr);
7626 tree size = TYPE_SIZE (ptr_type);
7627 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
7628 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
7629 tree elem_size_log =
7630 build_int_cst (type, exact_log2 (vectype_align/nelements));
7631 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
7632 tree nelements_tree = build_int_cst (type, nelements);
7633 tree byte_misalign;
7634 tree elem_misalign;
7635
7636 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
7637 gcc_assert (!new_bb);
7638
7639 /* Create: byte_misalign = addr & (vectype_size - 1) */
7640 byte_misalign =
7641 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
7642
7643 /* Create: elem_misalign = byte_misalign / element_size */
7644 elem_misalign =
7645 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
7646
7647 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
7648 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
7649 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
7650 iters = fold_convert (niters_type, iters);
7651 }
7652
7653 /* Create: prolog_loop_niters = min (iters, loop_niters) */
7654 /* If the loop bound is known at compile time we already verified that it is
7655 greater than vf; since the misalignment ('iters') is at most vf, there's
7656 no need to generate the MIN_EXPR in this case. */
7657 if (TREE_CODE (loop_niters) != INTEGER_CST)
7658 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
7659
7660 if (vect_print_dump_info (REPORT_DETAILS))
7661 {
7662 fprintf (vect_dump, "niters for prolog loop: ");
7663 print_generic_expr (vect_dump, iters, TDF_SLIM);
7664 }
7665
7666 var = create_tmp_var (niters_type, "prolog_loop_niters");
7667 add_referenced_var (var);
7668 stmts = NULL;
7669 iters_name = force_gimple_operand (iters, &stmts, false, var);
7670
7671 /* Insert stmt on loop preheader edge. */
7672 if (stmts)
7673 {
7674 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7675 gcc_assert (!new_bb);
7676 }
7677
7678 return iters_name;
7679 }
7680
7681
7682 /* Function vect_update_init_of_dr
7683
7684 NITERS iterations were peeled from LOOP. DR represents a data reference
7685 in LOOP. This function updates the information recorded in DR to
7686 account for the fact that the first NITERS iterations had already been
7687 executed. Specifically, it updates the OFFSET field of DR. */
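
/* Illustrative example only: the update amounts to

     DR_OFFSET (dr) += NITERS * DR_STEP (dr);

   e.g. peeling NITERS = 2 iterations of a data-ref with a 4-byte step adds
   8 bytes to its offset, so the vectorized loop starts where the peeled
   prolog left off.  */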
7688
7689 static void
7690 vect_update_init_of_dr (struct data_reference *dr, tree niters)
7691 {
7692 tree offset = DR_OFFSET (dr);
7693
7694 niters = fold_build2 (MULT_EXPR, sizetype,
7695 fold_convert (sizetype, niters),
7696 fold_convert (sizetype, DR_STEP (dr)));
7697 offset = fold_build2 (PLUS_EXPR, sizetype, offset, niters);
7698 DR_OFFSET (dr) = offset;
7699 }
7700
7701
7702 /* Function vect_update_inits_of_drs
7703
7704 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
7705 This function updates the information recorded for the data references in
7706 the loop to account for the fact that the first NITERS iterations had
7707 already been executed. Specifically, it updates the initial_condition of
7708 the access_function of all the data_references in the loop. */
7709
7710 static void
7711 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
7712 {
7713 unsigned int i;
7714 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
7715 struct data_reference *dr;
7716
7717 if (vect_print_dump_info (REPORT_DETAILS))
7718 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
7719
7720 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
7721 vect_update_init_of_dr (dr, niters);
7722 }
7723
7724
7725 /* Function vect_do_peeling_for_alignment
7726
7727 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
7728 'niters' is set to the misalignment of one of the data references in the
7729 loop, thereby forcing it to refer to an aligned location at the beginning
7730 of the execution of this loop. The data reference for which we are
7731 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
7732
7733 static void
7734 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
7735 {
7736 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7737 tree niters_of_prolog_loop, ni_name;
7738 tree n_iters;
7739 struct loop *new_loop;
7740 bool check_profitability = false;
7741 unsigned int th = 0;
7742 int min_profitable_iters;
7743
7744 if (vect_print_dump_info (REPORT_DETAILS))
7745 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
7746
7747 initialize_original_copy_tables ();
7748
7749 ni_name = vect_build_loop_niters (loop_vinfo);
7750 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
7751
7752
7753 /* If the cost model check was not done during versioning, do it now. */
7754 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7755 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7756 {
7757 check_profitability = true;
7758
7759 /* Get profitability threshold for vectorized loop. */
7760 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7761
7762 th = conservative_cost_threshold (loop_vinfo,
7763 min_profitable_iters);
7764 }
7765
7766 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
7767 new_loop =
7768 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
7769 niters_of_prolog_loop, ni_name, true,
7770 th, check_profitability);
7771
7772 gcc_assert (new_loop);
7773 #ifdef ENABLE_CHECKING
7774 slpeel_verify_cfg_after_peeling (new_loop, loop);
7775 #endif
7776
7777 /* Update number of times loop executes. */
7778 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
7779 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
7780 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
7781
7782 /* Update the init conditions of the access functions of all data refs. */
7783 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
7784
7785 /* After peeling we have to reset scalar evolution analyzer. */
7786 scev_reset ();
7787
7788 free_original_copy_tables ();
7789 }
7790
7791
7792 /* Function vect_create_cond_for_align_checks.
7793
7794 Create a conditional expression that represents the alignment checks for
7795 all of the data references (array element references) whose alignment must be
7796 checked at runtime.
7797
7798 Input:
7799 COND_EXPR - input conditional expression. New conditions will be chained
7800 with logical AND operation.
7801 LOOP_VINFO - two fields of the loop information are used.
7802 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
7803 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
7804
7805 Output:
7806 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7807 expression.
7808 The returned value is the conditional expression to be used in the if
7809 statement that controls which version of the loop gets executed at runtime.
7810
7811 The algorithm makes two assumptions:
7812 1) The number of bytes "n" in a vector is a power of 2.
7813 2) An address "a" is aligned if a%n is zero, and this
7814 test can be done as a&(n-1) == 0. For example, for 16
7815 byte vectors the test is a&0xf == 0. */
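
/* A scalar sketch of the generated check (illustrative only; addr0/addr1
   are hypothetical first-vector addresses of two such data-refs, with
   16-byte vectors, i.e. mask = 0xf):

     int ok = ((((uintptr_t) addr0 | (uintptr_t) addr1) & 0xf) == 0);

   OR-ing the addresses first lets a single AND-and-compare test all of
   them at once: any misaligned address leaves a nonzero low bit in the OR.  */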
7816
7817 static void
7818 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
7819 tree *cond_expr,
7820 gimple_seq *cond_expr_stmt_list)
7821 {
7822 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7823 VEC(gimple,heap) *may_misalign_stmts
7824 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
7825 gimple ref_stmt;
7826 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
7827 tree mask_cst;
7828 unsigned int i;
7829 tree psize;
7830 tree int_ptrsize_type;
7831 char tmp_name[20];
7832 tree or_tmp_name = NULL_TREE;
7833 tree and_tmp, and_tmp_name;
7834 gimple and_stmt;
7835 tree ptrsize_zero;
7836 tree part_cond_expr;
7837
7838 /* Check that mask is one less than a power of 2, i.e., mask is
7839 all zeros followed by all ones. */
7840 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
7841
7842 /* CHECKME: what is the best integer or unsigned type to use to hold a
7843 cast from a pointer value? */
7844 psize = TYPE_SIZE (ptr_type_node);
7845 int_ptrsize_type
7846 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
7847
7848 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
7849 of the first vector of the i'th data reference. */
7850
7851 for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
7852 {
7853 gimple_seq new_stmt_list = NULL;
7854 tree addr_base;
7855 tree addr_tmp, addr_tmp_name;
7856 tree or_tmp, new_or_tmp_name;
7857 gimple addr_stmt, or_stmt;
7858
7859 /* create: addr_tmp = (int)(address_of_first_vector) */
7860 addr_base =
7861 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
7862 NULL_TREE, loop);
7863 if (new_stmt_list != NULL)
7864 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
7865
7866 sprintf (tmp_name, "%s%d", "addr2int", i);
7867 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7868 add_referenced_var (addr_tmp);
7869 addr_tmp_name = make_ssa_name (addr_tmp, NULL);
7870 addr_stmt = gimple_build_assign_with_ops (NOP_EXPR, addr_tmp_name,
7871 addr_base, NULL_TREE);
7872 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
7873 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
7874
7875 /* The addresses are ORed together. */
7876
7877 if (or_tmp_name != NULL_TREE)
7878 {
7879 /* create: or_tmp = or_tmp | addr_tmp */
7880 sprintf (tmp_name, "%s%d", "orptrs", i);
7881 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7882 add_referenced_var (or_tmp);
7883 new_or_tmp_name = make_ssa_name (or_tmp, NULL);
7884 or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
7885 new_or_tmp_name,
7886 or_tmp_name, addr_tmp_name);
7887 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
7888 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
7889 or_tmp_name = new_or_tmp_name;
7890 }
7891 else
7892 or_tmp_name = addr_tmp_name;
7893
7894 } /* end for i */
7895
7896 mask_cst = build_int_cst (int_ptrsize_type, mask);
7897
7898 /* create: and_tmp = or_tmp & mask */
7899 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
7900 add_referenced_var (and_tmp);
7901 and_tmp_name = make_ssa_name (and_tmp, NULL);
7902
7903 and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
7904 or_tmp_name, mask_cst);
7905 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7906 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
7907
7908 /* Make and_tmp the left operand of the conditional test against zero.
7909 If and_tmp has a nonzero bit, then some address is unaligned. */
7910 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7911 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7912 and_tmp_name, ptrsize_zero);
7913 if (*cond_expr)
7914 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7915 *cond_expr, part_cond_expr);
7916 else
7917 *cond_expr = part_cond_expr;
7918 }
7919
7920 /* Function vect_vfa_segment_size.
7921
7922 Create an expression that computes the size of the segment
7923 that will be accessed by a data reference. The function takes into
7924 account that realignment loads may access one more vector.
7925
7926 Input:
7927 DR: The data reference.
7928 VECT_FACTOR: vectorization factor.
7929
7930 Return an expression whose value is the size of segment which will be
7931 accessed by DR. */
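
/* Illustrative example only: for DR_STEP = 4 bytes and VECT_FACTOR = 4,

     segment_length = 4 * 4 = 16 bytes

   per vector iteration; with the optimized realignment scheme one extra
   vector may be read, so TYPE_SIZE_UNIT of the vectype (e.g. 16 more
   bytes) is added below.  */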
7932
7933 static tree
7934 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7935 {
7936 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7937 DR_STEP (dr), vect_factor);
7938
7939 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7940 {
7941 tree vector_size = TYPE_SIZE_UNIT
7942 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7943
7944 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7945 segment_length, vector_size);
7946 }
7947 return fold_convert (sizetype, segment_length);
7948 }
7949
7950 /* Function vect_create_cond_for_alias_checks.
7951
7952 Create a conditional expression that represents the run-time checks for
7953 overlapping of address ranges represented by a list of data reference
7954 relations passed as input.
7955
7956 Input:
7957 COND_EXPR - input conditional expression. New conditions will be chained
7958 with logical AND operation.
7959 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
7960 to be checked.
7961
7962 Output:
7963 COND_EXPR - conditional expression.
7964 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7965 expression.
7966
7967
7968 The returned value is the conditional expression to be used in the if
7969 statement that controls which version of the loop gets executed at runtime.
7970 */
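
/* A scalar sketch of one generated check (illustrative only; the variable
   names are hypothetical):

     int no_alias = (store_ptr + store_segment_length < load_ptr)
                    || (load_ptr + load_segment_length < store_ptr);

   The vectorized loop version is entered only when every such pair of
   segments is found to be disjoint at runtime.  */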
7971
7972 static void
7973 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7974 tree * cond_expr,
7975 gimple_seq * cond_expr_stmt_list)
7976 {
7977 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7978 VEC (ddr_p, heap) * may_alias_ddrs =
7979 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7980 tree vect_factor =
7981 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7982
7983 ddr_p ddr;
7984 unsigned int i;
7985 tree part_cond_expr;
7986
7987 /* Create expression
7988 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7989 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
7990 &&
7991 ...
7992 &&
7993 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
7994 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
7995
7996 if (VEC_empty (ddr_p, may_alias_ddrs))
7997 return;
7998
7999 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
8000 {
8001 struct data_reference *dr_a, *dr_b;
8002 gimple dr_group_first_a, dr_group_first_b;
8003 tree addr_base_a, addr_base_b;
8004 tree segment_length_a, segment_length_b;
8005 gimple stmt_a, stmt_b;
8006
8007 dr_a = DDR_A (ddr);
8008 stmt_a = DR_STMT (DDR_A (ddr));
8009 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
8010 if (dr_group_first_a)
8011 {
8012 stmt_a = dr_group_first_a;
8013 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
8014 }
8015
8016 dr_b = DDR_B (ddr);
8017 stmt_b = DR_STMT (DDR_B (ddr));
8018 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
8019 if (dr_group_first_b)
8020 {
8021 stmt_b = dr_group_first_b;
8022 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
8023 }
8024
8025 addr_base_a =
8026 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
8027 NULL_TREE, loop);
8028 addr_base_b =
8029 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
8030 NULL_TREE, loop);
8031
8032 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
8033 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
8034
8035 if (vect_print_dump_info (REPORT_DR_DETAILS))
8036 {
8037 fprintf (vect_dump,
8038 "create runtime check for data references ");
8039 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
8040 fprintf (vect_dump, " and ");
8041 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
8042 }
8043
8044
8045 part_cond_expr =
8046 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
8047 fold_build2 (LT_EXPR, boolean_type_node,
8048 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
8049 addr_base_a,
8050 segment_length_a),
8051 addr_base_b),
8052 fold_build2 (LT_EXPR, boolean_type_node,
8053 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
8054 addr_base_b,
8055 segment_length_b),
8056 addr_base_a));
8057
8058 if (*cond_expr)
8059 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
8060 *cond_expr, part_cond_expr);
8061 else
8062 *cond_expr = part_cond_expr;
8063 }
8064 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8065 fprintf (vect_dump, "created %u versioning for alias checks.\n",
8066 VEC_length (ddr_p, may_alias_ddrs));
8067
8068 }
8069
8070 /* Function vect_loop_versioning.
8071
8072 If the loop has data references that may or may not be aligned and/or
8073 has data reference relations whose independence was not proven, then
8074 two versions of the loop need to be generated, one which is vectorized
8075 and one which isn't. A test is then generated to control which of the
8076 loops is executed. The test checks for the alignment of all of the
8077 data references that may or may not be aligned. An additional
8078 sequence of runtime tests is generated for each pair of DDRs whose
8079 independence was not proven. The vectorized version of the loop is
8080 executed only if both the alias and the alignment tests pass.
8081
8082 The test generated to check which version of the loop is executed
8083 is also modified to check for profitability, as indicated by the
8084 cost model. */
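
/* A rough sketch of the control flow created below (illustrative only;
   COND combines the profitability, alignment and alias tests):

     if (COND)
       vectorized_loop;
     else
       scalar_loop;
*/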
8085
8086 static void
8087 vect_loop_versioning (loop_vec_info loop_vinfo)
8088 {
8089 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8090 struct loop *nloop;
8091 tree cond_expr = NULL_TREE;
8092 gimple_seq cond_expr_stmt_list = NULL;
8093 basic_block condition_bb;
8094 gimple_stmt_iterator gsi, cond_exp_gsi;
8095 basic_block merge_bb;
8096 basic_block new_exit_bb;
8097 edge new_exit_e, e;
8098 gimple orig_phi, new_phi;
8099 tree arg;
8100 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
8101 gimple_seq gimplify_stmt_list = NULL;
8102 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
8103 int min_profitable_iters = 0;
8104 unsigned int th;
8105
8106 /* Get profitability threshold for vectorized loop. */
8107 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
8108
8109 th = conservative_cost_threshold (loop_vinfo,
8110 min_profitable_iters);
8111
8112 cond_expr =
8113 fold_build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
8114 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
8115
8116 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
8117 false, NULL_TREE);
8118
8119 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
8120 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
8121 &cond_expr_stmt_list);
8122
8123 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
8124 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
8125 &cond_expr_stmt_list);
8126
8127 cond_expr =
8128 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
8129 cond_expr =
8130 force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
8131 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
8132
8133 initialize_original_copy_tables ();
8134 nloop = loop_version (loop, cond_expr, &condition_bb,
8135 prob, prob, REG_BR_PROB_BASE - prob, true);
8136 free_original_copy_tables();
8137
8138 /* Loop versioning violates an assumption we try to maintain during
8139 vectorization - that the loop exit block has a single predecessor.
8140 After versioning, the exit block of both loop versions is the same
8141 basic block (i.e. it has two predecessors). To simplify the
8142 following transformations in the vectorizer, we fix this situation
8143 here by adding a new (empty) block on the exit-edge of the loop,
8144 with the proper loop-exit phis to maintain loop-closed-form. */
8145
8146 merge_bb = single_exit (loop)->dest;
8147 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
8148 new_exit_bb = split_edge (single_exit (loop));
8149 new_exit_e = single_exit (loop);
8150 e = EDGE_SUCC (new_exit_bb, 0);
8151
8152 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8153 {
8154 orig_phi = gsi_stmt (gsi);
8155 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
8156 new_exit_bb);
8157 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
8158 add_phi_arg (new_phi, arg, new_exit_e);
8159 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
8160 }
8161
8162 /* End loop-exit-fixes after versioning. */
8163
8164 update_ssa (TODO_update_ssa);
8165 if (cond_expr_stmt_list)
8166 {
8167 cond_exp_gsi = gsi_last_bb (condition_bb);
8168 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, GSI_SAME_STMT);
8169 }
8170 }
8171
8172 /* Remove a group of stores (for SLP or interleaving), free their
8173 stmt_vec_info. */
8174
8175 static void
8176 vect_remove_stores (gimple first_stmt)
8177 {
8178 gimple next = first_stmt;
8179 gimple tmp;
8180 gimple_stmt_iterator next_si;
8181
8182 while (next)
8183 {
8184 /* Free the attached stmt_vec_info and remove the stmt. */
8185 next_si = gsi_for_stmt (next);
8186 gsi_remove (&next_si, true);
8187 tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
8188 free_stmt_vec_info (next);
8189 next = tmp;
8190 }
8191 }
8192
8193
8194 /* Vectorize SLP instance tree in postorder. */
8195
8196 static bool
8197 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
8198 unsigned int vectorization_factor)
8199 {
8200 gimple stmt;
8201 bool strided_store, is_store;
8202 gimple_stmt_iterator si;
8203 stmt_vec_info stmt_info;
8204 unsigned int vec_stmts_size, nunits, group_size;
8205 tree vectype;
8206 int i;
8207 slp_tree loads_node;
8208
8209 if (!node)
8210 return false;
8211
8212 vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
8213 vectorization_factor);
8214 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
8215 vectorization_factor);
8216
8217 stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
8218 stmt_info = vinfo_for_stmt (stmt);
8219
8220 /* VECTYPE is the type of the destination. */
8221 vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt)));
8222 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
8223 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
8224
8225 /* For each SLP instance calculate the number of vector stmts to be
8226 created for the scalar stmts in each node of the SLP tree. The number
8227 of vector elements processed in one vector iteration is the number of
8228 scalar elements in one scalar iteration (GROUP_SIZE) multiplied by VF,
8229 divided by the vector size. */
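/* For instance (numbers purely illustrative): with GROUP_SIZE == 4,
   VF == 4 and a vector type holding 4 elements (nunits == 4), each
   node needs (4 * 4) / 4 == 4 vector stmts.  */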
8230 vec_stmts_size = (vectorization_factor * group_size) / nunits;
8231
8232 /* In case of load permutation we have to allocate vectorized statements for
8233 all the nodes that participate in that permutation. */
8234 if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
8235 {
8236 for (i = 0;
8237 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node);
8238 i++)
8239 {
8240 if (!SLP_TREE_VEC_STMTS (loads_node))
8241 {
8242 SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
8243 vec_stmts_size);
8244 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
8245 }
8246 }
8247 }
8248
8249 if (!SLP_TREE_VEC_STMTS (node))
8250 {
8251 SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
8252 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
8253 }
8254
8255 if (vect_print_dump_info (REPORT_DETAILS))
8256 {
8257 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
8258 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
8259 }
8260
8261 /* Loads should be inserted before the first load. */
8262 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
8263 && STMT_VINFO_STRIDED_ACCESS (stmt_info)
8264 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
8265 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
8266 else
8267 si = gsi_for_stmt (stmt);
8268
8269 is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
8270 if (is_store)
8271 {
8272 if (DR_GROUP_FIRST_DR (stmt_info))
8273 /* If IS_STORE is TRUE, the vectorization of the
8274 interleaving chain was completed - free all the stores in
8275 the chain. */
8276 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
8277 else
8278 /* FORNOW: SLP originates only from strided stores. */
8279 gcc_unreachable ();
8280
8281 return true;
8282 }
8283
8284 /* FORNOW: SLP originates only from strided stores. */
8285 return false;
8286 }
8287
8288
8289 static bool
8290 vect_schedule_slp (loop_vec_info loop_vinfo)
8291 {
8292 VEC (slp_instance, heap) *slp_instances =
8293 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
8294 slp_instance instance;
8295 unsigned int i;
8296 bool is_store = false;
8297
8298 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
8299 {
8300 /* Schedule the tree of INSTANCE. */
8301 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
8302 instance, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
8303
8304 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
8305 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
8306 fprintf (vect_dump, "vectorizing stmts using SLP.");
8307 }
8308
8309 return is_store;
8310 }
8311
8312 /* Function vect_transform_loop.
8313
8314 The analysis phase has determined that the loop is vectorizable.
8315 Vectorize the loop - create vectorized stmts to replace the scalar
8316 stmts in the loop, and update the loop exit condition. */
8317
8318 void
8319 vect_transform_loop (loop_vec_info loop_vinfo)
8320 {
8321 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8322 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8323 int nbbs = loop->num_nodes;
8324 gimple_stmt_iterator si;
8325 int i;
8326 tree ratio = NULL;
8327 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8328 bool strided_store;
8329 bool slp_scheduled = false;
8330 unsigned int nunits;
8331
8332 if (vect_print_dump_info (REPORT_DETAILS))
8333 fprintf (vect_dump, "=== vec_transform_loop ===");
8334
8335 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
8336 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
8337 vect_loop_versioning (loop_vinfo);
8338
8339 /* CHECKME: we wouldn't need this if we called update_ssa once
8340 for all loops. */
8341 bitmap_zero (vect_memsyms_to_rename);
8342
8343 /* Peel the loop if there are data refs with unknown alignment.
8344 Only one data ref with unknown alignment is allowed. */
8345
8346 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
8347 vect_do_peeling_for_alignment (loop_vinfo);
8348
8349 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
8350 compile-time constant), or it is a constant that is not a multiple of the
8351 vectorization factor, then an epilog loop needs to be created.
8352 We therefore duplicate the loop: the original loop will be vectorized,
8353 and will compute the first (n/VF) iterations. The second copy of the loop
8354 will remain scalar and will compute the remaining (n%VF) iterations.
8355 (VF is the vectorization factor). */
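/* For instance (numbers purely illustrative): with n == 103 and VF == 4,
   the vectorized loop executes 103 / 4 == 25 iterations (covering 100
   scalar iterations) and the scalar epilog executes the remaining
   103 % 4 == 3 iterations.  When n is a compile-time constant that is a
   multiple of VF (e.g. n == 100, VF == 4), no epilog is needed and RATIO
   below is simply n / VF == 25.  */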
8356
8357 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8358 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8359 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
8360 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
8361 else
8362 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8363 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
8364
8365 /* 1) Make sure the loop header has exactly two entries
8366 2) Make sure we have a preheader basic block. */
8367
8368 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8369
8370 split_edge (loop_preheader_edge (loop));
8371
8372 /* FORNOW: the vectorizer supports only loops whose body consists
8373 of one basic block (header + empty latch). When the vectorizer
8374 supports more involved loop forms, the order in which the BBs are
8375 traversed will need to be reconsidered. */
8376
8377 for (i = 0; i < nbbs; i++)
8378 {
8379 basic_block bb = bbs[i];
8380 stmt_vec_info stmt_info;
8381 gimple phi;
8382
8383 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
8384 {
8385 phi = gsi_stmt (si);
8386 if (vect_print_dump_info (REPORT_DETAILS))
8387 {
8388 fprintf (vect_dump, "------>vectorizing phi: ");
8389 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
8390 }
8391 stmt_info = vinfo_for_stmt (phi);
8392 if (!stmt_info)
8393 continue;
8394
8395 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8396 && !STMT_VINFO_LIVE_P (stmt_info))
8397 continue;
8398
8399 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
8400 != (unsigned HOST_WIDE_INT) vectorization_factor)
8401 && vect_print_dump_info (REPORT_DETAILS))
8402 fprintf (vect_dump, "multiple-types.");
8403
8404 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
8405 {
8406 if (vect_print_dump_info (REPORT_DETAILS))
8407 fprintf (vect_dump, "transform phi.");
8408 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8409 }
8410 }
8411
8412 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
8413 {
8414 gimple stmt = gsi_stmt (si);
8415 bool is_store;
8416
8417 if (vect_print_dump_info (REPORT_DETAILS))
8418 {
8419 fprintf (vect_dump, "------>vectorizing statement: ");
8420 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
8421 }
8422
8423 stmt_info = vinfo_for_stmt (stmt);
8424
8425 /* vector stmts created in the outer-loop during vectorization of
8426 stmts in an inner-loop may not have a stmt_info, and do not
8427 need to be vectorized. */
8428 if (!stmt_info)
8429 {
8430 gsi_next (&si);
8431 continue;
8432 }
8433
8434 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8435 && !STMT_VINFO_LIVE_P (stmt_info))
8436 {
8437 gsi_next (&si);
8438 continue;
8439 }
8440
8441 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
8442 nunits =
8443 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8444 if (!STMT_SLP_TYPE (stmt_info)
8445 && nunits != (unsigned int) vectorization_factor
8446 && vect_print_dump_info (REPORT_DETAILS))
8447 /* For SLP, VF is set according to the unrolling factor, and not to
8448 the vector size, hence this print is not valid for SLP. */
8449 fprintf (vect_dump, "multiple-types.");
8450
8451 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8452 reached. */
8453 if (STMT_SLP_TYPE (stmt_info))
8454 {
8455 if (!slp_scheduled)
8456 {
8457 slp_scheduled = true;
8458
8459 if (vect_print_dump_info (REPORT_DETAILS))
8460 fprintf (vect_dump, "=== scheduling SLP instances ===");
8461
8462 vect_schedule_slp (loop_vinfo);
8463 }
8464
8465 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8466 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8467 {
8468 gsi_next (&si);
8469 continue;
8470 }
8471 }
8472
8473 /* -------- vectorize statement ------------ */
8474 if (vect_print_dump_info (REPORT_DETAILS))
8475 fprintf (vect_dump, "transform statement.");
8476
8477 strided_store = false;
8478 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
8479 if (is_store)
8480 {
8481 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
8482 {
8483 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8484 interleaving chain was completed - free all the stores in
8485 the chain. */
8486 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
8487 gsi_remove (&si, true);
8488 continue;
8489 }
8490 else
8491 {
8492 /* Free the attached stmt_vec_info and remove the stmt. */
8493 free_stmt_vec_info (stmt);
8494 gsi_remove (&si, true);
8495 continue;
8496 }
8497 }
8498 gsi_next (&si);
8499 } /* stmts in BB */
8500 } /* BBs in loop */
8501
8502 slpeel_make_loop_iterate_ntimes (loop, ratio);
8503
8504 mark_set_for_renaming (vect_memsyms_to_rename);
8505
8506 /* The memory tags and pointers in vectorized statements need to
8507 have their SSA forms updated. FIXME, why can't this be delayed
8508 until all the loops have been transformed? */
8509 update_ssa (TODO_update_ssa);
8510
8511 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8512 fprintf (vect_dump, "LOOP VECTORIZED.");
8513 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8514 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
8515 }