comparison gcc/tree-vect-transform.c @ 0:a06113de4d67
first commit

author:   kent <kent@cr.ie.u-ryukyu.ac.jp>
date:     Fri, 17 Jul 2009 14:47:48 +0900
parents:  (none)
children: 855418dad1a3
comparing -1:000000000000 with 0:a06113de4d67
/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"

/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
                                 slp_tree, slp_instance);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (gimple, struct loop*, tree, tree *, gimple *, bool, bool *, tree);
static tree vect_create_addr_base_for_vector_ref
  (gimple, gimple_seq *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
static void vect_finish_stmt_generation
  (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction
  (tree, gimple, int, enum tree_code, gimple);
static tree get_initial_def_for_reduction (gimple, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);


static int
cost_for_stmt (gimple stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}


/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */

int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
    }

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
         unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
232 "peeling for alignment is unknown ."); | |

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
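          /* For example, for V4SI (nelements = 4, element_size = 4 bytes)
             and a byte misalignment of 8, the prologue peels
             4 - 8/4 = 2 scalar iterations to reach an aligned address.  */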
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
261 "loop iterations are unknown ."); | |
262 | |
263 /* If peeled iterations are known but number of scalar loop | |
264 iterations are unknown, count a taken branch per peeled loop. */ | |
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;

        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }

  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.
     TODO: The back end may reorder the BBs differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     does not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* Cost model check occurs at versioning.  */
      if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
          || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
        scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
              + TARG_COND_NOT_TAKEN_BRANCH_COST;
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
    }

  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
     SOC = scalar outside cost for run time cost model check.  */
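
  /* A worked example with illustrative numbers: for SIC = 4, VIC = 6,
     VOC = 14, SOC = 0, VF = 4 and no peeling, the computation below
     yields (14 - 0) * 4 / (4 * 4 - 6) = 56 / 10 = 5; and since
     4 * 4 * 5 = 80 is still <= 6 * 5 + (14 - 0) * 4 = 86, the result
     is bumped to 6 iterations.  */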

  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is divisible by scalar iteration cost = %d by a factor "
                 "greater than or equal to the vectorization factor = %d .",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }

  if (vect_print_dump_info (REPORT_COST))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
               scalar_single_iter_cost);
      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }

  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}


/* TODO: The vect_model_*_cost functions and the vectorizable_* functions
   are closely coupled.  Redesign to avoid such maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  gimple stmt, orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);


  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  stmt = STMT_VINFO_STMT (stmt_info);

  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
      break;
    case GIMPLE_UNARY_RHS:
      reduction_op = gimple_assign_rhs1 (stmt);
      break;
    case GIMPLE_BINARY_RHS:
      reduction_op = gimple_assign_rhs2 (stmt);
      break;
    default:
      gcc_unreachable ();
    }

  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_COST))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return false;
    }

  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = gimple_assign_rhs_code (orig_stmt);

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype, optab_default);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.  Also
               requires scalar extract.  */
            outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N elements,
               we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
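
          /* For example, for a vector of 4 elements the shift-based scheme
             above costs exact_log2 (4) * 2 = 4 vector statements plus one
             extract, while this extract-based fallback costs
             4 + 3 = 7 vector statements.  */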
        }
    }

  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));

  return true;
}


/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}


/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  inside_cost = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmts.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}


/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */
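
/* For example, in a group of 4 strided stores, the first store reports a
   group size of 4 and the remaining three report 1, so the group-wide
   permute overhead is charged exactly once.  */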

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}


/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2(group_size) * group_size
                    * TARG_VEC_STMT_COST;
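
      /* For example, a group of 4 interleaved stores needs
         exact_log2 (4) * 4 = 8 interleave statements per vector copy.  */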

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                 group_size);

    }

  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}


/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)

{
  int group_size;
  int alignment_support_scheme;
  gimple first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);

  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses even and odd extract operations for each needed permute.  */
      inside_cost = ncopies * exact_log2(group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                 group_size);

    }

  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2*TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }

    default:
      gcc_unreachable ();
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}


/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme prepends
   the prefix "vect_", "stmp_" or "vect_p" (depending on the value of
   VAR_KIND) to NAME if provided, and otherwise uses the prefix alone.  */
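
/* For example, with NAME "in" and VAR_KIND vect_pointer_var, the base name
   of the new variable is "vect_pin" (create_tmp_var may append a unique
   suffix); with a NULL NAME it is just "vect_p".  */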

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}


/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP: Specifies the loop-nest relative to which the address is computed.
         For example, when the dataref is in an inner-loop nested in an
         outer-loop that is now being vectorized, LOOP can be either the
         outer-loop, or the inner-loop.  The first memory location accessed
         by the following dataref ('in' points to short):

                for (i=0; i<N; i++)
                   for (j=0; j<M; j++)
                     s += in[i+j]

         is as follows:
         if LOOP=i_loop: &in             (relative to i_loop)
         if LOOP=j_loop: &in+i*2B        (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

static tree
vect_create_addr_base_for_vector_ref (gimple stmt,
                                      gimple_seq *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest;
  gimple_seq seq = NULL;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  gcc_assert (loop);
  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }

  /* Create data_ref_base */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
                                        data_ref_base_var);
  gimple_seq_add_seq (new_stmt_list, seq);

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR,
                            fold_convert (sizetype, base_offset),
                            fold_convert (sizetype, init));
  dest = create_tmp_var (sizetype, "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, sizetype,
                            fold_convert (sizetype, offset), step);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
      gimple_seq_add_seq (new_stmt_list, seq);
    }

  /* base + base_offset */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}


/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
         GIMPLE_ASSIGN <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

        v8hi *vp;
        vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */

static tree
vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, gimple *ptr_incr,
                          bool only_init, bool *inv_p, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  gimple vec_stmt;
  gimple_seq new_stmt_list = NULL;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple incr;
  tree step;

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);

  if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
      && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
    vect_ptr_type = build_qualified_type (vect_ptr_type, TYPE_QUAL_RESTRICT);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
      && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
    {
      get_alias_set (base_name);
      DECL_POINTER_ALIAS_SET (vect_ptr)
        = DECL_POINTER_ALIAS_SET (SSA_NAME_VAR (DR_BASE_ADDRESS (dr)));
    }

  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    {
      set_symbol_mem_tag (vect_ptr, tag);
      mark_sym_for_renaming (tag);
    }

  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
      def-use update cycles for the pointer: One relative to the outer-loop
      (LOOP), which is what steps (3) and (4) below do.  The other is relative
      to the inner-loop (which is the inner-most loop containing the dataref),
      and this is done by step (5) below.

      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
      inner-most loop, and so steps (3),(4) work the same, and step (5) is
      redundant.  Steps (3),(4) create the following:

         vp0 = &base_addr;
         LOOP:  vp1 = phi(vp0,vp2)
                ...
                ...
                vp2 = vp1 + step
                goto LOOP

      If there is an inner-loop nested in loop, then step (5) will also be
      applied, and an additional update in the inner-loop will be created:

         vp0 = &base_addr;
         LOOP:   vp1 = phi(vp0,vp2)
                 ...
         inner:     vp3 = phi(vp1,vp4)
                    vp4 = vp3 + inner_step
                    if () goto inner
                 ...
                 vp2 = vp1 + step
                 if () goto LOOP   */

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: &(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  if (new_stmt_list)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
      gcc_assert (!new_bb);
    }

  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = gimple_build_assign (vect_ptr,
                                  fold_convert (vect_ptr_type, new_temp));
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
  new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);


  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop vectorization).
   **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 vect_ptr, loop, &incr_gsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      vptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return vptr;


  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if it exists:  **/

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_gsi,
                                      &insert_after);
      create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), vect_ptr,
                 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}


/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.

*/

static tree
bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
                 gimple stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  gimple incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
                                            dataref_ptr, update);
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}


/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}


/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
                  gimple_stmt_iterator *gsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  gimple init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = gimple_build_assign (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  gimple_assign_set_lhs (init_stmt, new_temp);

  if (gsi)
    vect_finish_stmt_generation (stmt, init_stmt, gsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
    }

  vec_oprnd = gimple_assign_lhs (init_stmt);
  return vec_oprnd;
}


/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
   stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.  */

static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                           unsigned int op_num, unsigned int number_of_vectors)
{
  VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple stmt = VEC_index (gimple, stmts, 0);
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits;
  tree vec_cst;
  tree t = NULL_TREE;
  int j, number_of_places_left_in_vector;
  tree vector_type;
  tree op, vop;
  int group_size = VEC_length (gimple, stmts);
  unsigned int vec_num, i;
  int number_of_copies = 1;
  VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
  bool constant_p, is_store;

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    {
      is_store = true;
      op = gimple_assign_rhs1 (stmt);
    }
  else
    {
      is_store = false;
      op = gimple_op (stmt, op_num + 1);
    }

  if (CONSTANT_CLASS_P (op))
    {
      vector_type = vectype;
      constant_p = true;
    }
  else
    {
      vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
      gcc_assert (vector_type);
      constant_p = false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vector_type);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  number_of_copies = least_common_multiple (nunits, group_size) / group_size;
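
  /* Continuing the examples above: nunits = 4 and group_size = 2 give
     least_common_multiple (4, 2) / 2 = 2 copies, while nunits = 4 and
     group_size = 8 give least_common_multiple (4, 8) / 8 = 1 copy spread
     over two vectors.  */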

  number_of_places_left_in_vector = nunits;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
        {
          if (is_store)
            op = gimple_assign_rhs1 (stmt);
          else
            op = gimple_op (stmt, op_num + 1);

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          t = tree_cons (NULL_TREE, op, t);

          number_of_places_left_in_vector--;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              if (constant_p)
                vec_cst = build_vector (vector_type, t);
              else
                vec_cst = build_constructor_from_list (vector_type, t);
              VEC_quick_push (tree, voprnds,
                              vect_init_vector (stmt, vec_cst, vector_type, NULL));
              t = NULL_TREE;
            }
        }
    }

  /* Since the vectors are created in reverse order, we have to reverse
     them here.  */
1501 vec_num = VEC_length (tree, voprnds); | |
1502 for (j = vec_num - 1; j >= 0; j--) | |
1503 { | |
1504 vop = VEC_index (tree, voprnds, j); | |
1505 VEC_quick_push (tree, *vec_oprnds, vop); | |
1506 } | |
1507 | |
1508 VEC_free (tree, heap, voprnds); | |
1509 | |
  /* In case VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we
     have to replicate the vectors.  */
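  /* E.g. (values assumed for illustration): if the loop above built the
     single vector {s1, s2, s1, s2} (GROUP_SIZE = 2, NUNITS = 4) but the
     caller asked for NUMBER_OF_VECTORS = 4, that vector is pushed three
     more times here.  */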
1514 while (number_of_vectors > VEC_length (tree, *vec_oprnds)) | |
1515 { | |
1516 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++) | |
1517 VEC_quick_push (tree, *vec_oprnds, vop); | |
1518 } | |
1519 } | |
1520 | |
1521 | |
1522 /* Get vectorized definitions from SLP_NODE that contains corresponding | |
1523 vectorized def-stmts. */ | |
1524 | |
1525 static void | |
1526 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds) | |
1527 { | |
1528 tree vec_oprnd; | |
1529 gimple vec_def_stmt; | |
1530 unsigned int i; | |
1531 | |
1532 gcc_assert (SLP_TREE_VEC_STMTS (slp_node)); | |
1533 | |
1534 for (i = 0; | |
1535 VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt); | |
1536 i++) | |
1537 { | |
1538 gcc_assert (vec_def_stmt); | |
1539 vec_oprnd = gimple_get_lhs (vec_def_stmt); | |
1540 VEC_quick_push (tree, *vec_oprnds, vec_oprnd); | |
1541 } | |
1542 } | |
1543 | |
1544 | |
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them
   and call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized
   stmts must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
   vect_get_slp_vect_defs() to retrieve them.
   If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand
   (from the right node).  This is used when the second operand must remain
   scalar.  */
1553 | |
1554 static void | |
1555 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0, | |
1556 VEC (tree,heap) **vec_oprnds1) | |
1557 { | |
1558 gimple first_stmt; | |
1559 enum tree_code code; | |
1560 int number_of_vects; | |
1561 HOST_WIDE_INT lhs_size_unit, rhs_size_unit; | |
1562 | |
1563 first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0); | |
1564 /* The number of vector defs is determined by the number of vector statements | |
1565 in the node from which we get those statements. */ | |
1566 if (SLP_TREE_LEFT (slp_node)) | |
1567 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node)); | |
1568 else | |
1569 { | |
1570 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | |
1571 /* Number of vector stmts was calculated according to LHS in | |
1572 vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if | |
1573 necessary. See vect_get_smallest_scalar_type() for details. */ | |
1574 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit, | |
1575 &rhs_size_unit); | |
1576 if (rhs_size_unit != lhs_size_unit) | |
1577 { | |
1578 number_of_vects *= rhs_size_unit; | |
1579 number_of_vects /= lhs_size_unit; | |
1580 } | |
1581 } | |
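
  /* Illustration (types assumed): if the group stores ints computed from
     chars, the LHS-based count above might be 4 V4SI stmts; since four
     times as many chars fit in a vector of the same width, the char
     operands need only 4 * 1 / 4 = 1 vector def.  */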
1582 | |
1583 /* Allocate memory for vectorized defs. */ | |
1584 *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects); | |
1585 | |
1586 /* SLP_NODE corresponds either to a group of stores or to a group of | |
1587 unary/binary operations. We don't call this function for loads. */ | |
1588 if (SLP_TREE_LEFT (slp_node)) | |
1589 /* The defs are already vectorized. */ | |
1590 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0); | |
1591 else | |
1592 /* Build vectors from scalar defs. */ | |
1593 vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects); | |
1594 | |
1595 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt))) | |
1596 /* Since we don't call this function with loads, this is a group of | |
1597 stores. */ | |
1598 return; | |
1599 | |
1600 code = gimple_assign_rhs_code (first_stmt); | |
1601 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1) | |
1602 return; | |
1603 | |
1604 /* The number of vector defs is determined by the number of vector statements | |
1605 in the node from which we get those statements. */ | |
1606 if (SLP_TREE_RIGHT (slp_node)) | |
1607 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node)); | |
1608 else | |
1609 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | |
1610 | |
1611 *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects); | |
1612 | |
1613 if (SLP_TREE_RIGHT (slp_node)) | |
1614 /* The defs are already vectorized. */ | |
1615 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1); | |
1616 else | |
1617 /* Build vectors from scalar defs. */ | |
1618 vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects); | |
1619 } | |
1620 | |
1621 | |
1622 /* Function get_initial_def_for_induction | |
1623 | |
1624 Input: | |
1625 STMT - a stmt that performs an induction operation in the loop. | |
1626 IV_PHI - the initial value of the induction variable | |
1627 | |
1628 Output: | |
1629 Return a vector variable, initialized with the first VF values of | |
1630 the induction variable. E.g., for an iv with IV_PHI='X' and | |
1631 evolution S, for a vector of 4 units, we want to return: | |
1632 [X, X + S, X + 2*S, X + 3*S]. */ | |
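
/* For instance (values assumed for illustration): X = 0, S = 3 and a
   four-unit vector (VF = 4) yield vec_init = [0, 3, 6, 9]; the code
   below then also builds vec_step = [VF*S, ...] = [12, 12, 12, 12].  */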
1633 | |
1634 static tree | |
1635 get_initial_def_for_induction (gimple iv_phi) | |
1636 { | |
1637 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi); | |
1638 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); | |
1639 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
1640 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi)); | |
1641 tree vectype; | |
1642 int nunits; | |
1643 edge pe = loop_preheader_edge (loop); | |
1644 struct loop *iv_loop; | |
1645 basic_block new_bb; | |
1646 tree vec, vec_init, vec_step, t; | |
1647 tree access_fn; | |
1648 tree new_var; | |
1649 tree new_name; | |
1650 gimple init_stmt, induction_phi, new_stmt; | |
1651 tree induc_def, vec_def, vec_dest; | |
1652 tree init_expr, step_expr; | |
1653 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
1654 int i; | |
1655 bool ok; | |
1656 int ncopies; | |
1657 tree expr; | |
1658 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi); | |
1659 bool nested_in_vect_loop = false; | |
1660 gimple_seq stmts = NULL; | |
1661 imm_use_iterator imm_iter; | |
1662 use_operand_p use_p; | |
1663 gimple exit_phi; | |
1664 edge latch_e; | |
1665 tree loop_arg; | |
1666 gimple_stmt_iterator si; | |
1667 basic_block bb = gimple_bb (iv_phi); | |
1668 | |
1669 vectype = get_vectype_for_scalar_type (scalar_type); | |
1670 gcc_assert (vectype); | |
1671 nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
1672 ncopies = vf / nunits; | |
1673 | |
1674 gcc_assert (phi_info); | |
1675 gcc_assert (ncopies >= 1); | |
1676 | |
1677 /* Find the first insertion point in the BB. */ | |
1678 si = gsi_after_labels (bb); | |
1679 | |
1680 if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type)) | |
1681 step_expr = build_int_cst (scalar_type, 0); | |
1682 else | |
1683 step_expr = build_real (scalar_type, dconst0); | |
1684 | |
1685 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */ | |
1686 if (nested_in_vect_loop_p (loop, iv_phi)) | |
1687 { | |
1688 nested_in_vect_loop = true; | |
1689 iv_loop = loop->inner; | |
1690 } | |
1691 else | |
1692 iv_loop = loop; | |
1693 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father); | |
1694 | |
1695 latch_e = loop_latch_edge (iv_loop); | |
1696 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e); | |
1697 | |
1698 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi)); | |
1699 gcc_assert (access_fn); | |
1700 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn, | |
1701 &init_expr, &step_expr); | |
1702 gcc_assert (ok); | |
1703 pe = loop_preheader_edge (iv_loop); | |
1704 | |
1705 /* Create the vector that holds the initial_value of the induction. */ | |
1706 if (nested_in_vect_loop) | |
1707 { | |
      /* iv_loop is nested in the loop to be vectorized.  init_expr has
         already been created during vectorization of previous stmts; we
         obtain it from the STMT_VINFO_VEC_STMT of the defining stmt.  */
1711 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop)); | |
1712 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL); | |
1713 } | |
1714 else | |
1715 { | |
1716 /* iv_loop is the loop to be vectorized. Create: | |
1717 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ | |
1718 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_"); | |
1719 add_referenced_var (new_var); | |
1720 | |
1721 new_name = force_gimple_operand (init_expr, &stmts, false, new_var); | |
1722 if (stmts) | |
1723 { | |
1724 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | |
1725 gcc_assert (!new_bb); | |
1726 } | |
1727 | |
1728 t = NULL_TREE; | |
1729 t = tree_cons (NULL_TREE, init_expr, t); | |
1730 for (i = 1; i < nunits; i++) | |
1731 { | |
1732 /* Create: new_name_i = new_name + step_expr */ | |
1733 enum tree_code code = POINTER_TYPE_P (scalar_type) | |
1734 ? POINTER_PLUS_EXPR : PLUS_EXPR; | |
1735 init_stmt = gimple_build_assign_with_ops (code, new_var, | |
1736 new_name, step_expr); | |
1737 new_name = make_ssa_name (new_var, init_stmt); | |
1738 gimple_assign_set_lhs (init_stmt, new_name); | |
1739 | |
1740 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt); | |
1741 gcc_assert (!new_bb); | |
1742 | |
1743 if (vect_print_dump_info (REPORT_DETAILS)) | |
1744 { | |
1745 fprintf (vect_dump, "created new init_stmt: "); | |
1746 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM); | |
1747 } | |
1748 t = tree_cons (NULL_TREE, new_name, t); | |
1749 } | |
1750 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */ | |
1751 vec = build_constructor_from_list (vectype, nreverse (t)); | |
1752 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL); | |
1753 } | |
1754 | |
1755 | |
1756 /* Create the vector that holds the step of the induction. */ | |
1757 if (nested_in_vect_loop) | |
1758 /* iv_loop is nested in the loop to be vectorized. Generate: | |
1759 vec_step = [S, S, S, S] */ | |
1760 new_name = step_expr; | |
1761 else | |
1762 { | |
1763 /* iv_loop is the loop to be vectorized. Generate: | |
1764 vec_step = [VF*S, VF*S, VF*S, VF*S] */ | |
1765 expr = build_int_cst (scalar_type, vf); | |
1766 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr); | |
1767 } | |
1768 | |
1769 t = NULL_TREE; | |
1770 for (i = 0; i < nunits; i++) | |
1771 t = tree_cons (NULL_TREE, unshare_expr (new_name), t); | |
1772 gcc_assert (CONSTANT_CLASS_P (new_name)); | |
1773 vec = build_vector (vectype, t); | |
1774 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL); | |
1775 | |
1776 | |
1777 /* Create the following def-use cycle: | |
1778 loop prolog: | |
1779 vec_init = ... | |
1780 vec_step = ... | |
1781 loop: | |
1782 vec_iv = PHI <vec_init, vec_loop> | |
1783 ... | |
1784 STMT | |
1785 ... | |
1786 vec_loop = vec_iv + vec_step; */ | |
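
  /* Concrete trace of the cycle above (X = 0, S = 3, VF = nunits = 4
     assumed): vec_iv starts as {0, 3, 6, 9} and each pass through the
     latch adds {12, 12, 12, 12}, so the k-th vector iteration computes
     {12k, 12k+3, 12k+6, 12k+9}, i.e. four consecutive scalar iv
     values.  */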
1787 | |
1788 /* Create the induction-phi that defines the induction-operand. */ | |
1789 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); | |
1790 add_referenced_var (vec_dest); | |
1791 induction_phi = create_phi_node (vec_dest, iv_loop->header); | |
1792 set_vinfo_for_stmt (induction_phi, | |
1793 new_stmt_vec_info (induction_phi, loop_vinfo)); | |
1794 induc_def = PHI_RESULT (induction_phi); | |
1795 | |
1796 /* Create the iv update inside the loop */ | |
1797 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest, | |
1798 induc_def, vec_step); | |
1799 vec_def = make_ssa_name (vec_dest, new_stmt); | |
1800 gimple_assign_set_lhs (new_stmt, vec_def); | |
1801 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); | |
1802 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo)); | |
1803 | |
1804 /* Set the arguments of the phi node: */ | |
1805 add_phi_arg (induction_phi, vec_init, pe); | |
1806 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop)); | |
1807 | |
1808 | |
  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt, i.e. we need to "unroll" the vector stmt by
     a factor VF/nunits.  For more details see documentation in
     vectorizable_operation.  */
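
  /* E.g. (illustration): VF = 8 with V4SI gives ncopies = 2; the code
     below chains one extra copy whose step vector is
     [nunits * S, ...] = [4*S, 4*S, 4*S, 4*S].  */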
1814 | |
1815 if (ncopies > 1) | |
1816 { | |
1817 stmt_vec_info prev_stmt_vinfo; | |
1818 /* FORNOW. This restriction should be relaxed. */ | |
1819 gcc_assert (!nested_in_vect_loop); | |
1820 | |
1821 /* Create the vector that holds the step of the induction. */ | |
1822 expr = build_int_cst (scalar_type, nunits); | |
1823 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr); | |
1824 t = NULL_TREE; | |
1825 for (i = 0; i < nunits; i++) | |
1826 t = tree_cons (NULL_TREE, unshare_expr (new_name), t); | |
1827 gcc_assert (CONSTANT_CLASS_P (new_name)); | |
1828 vec = build_vector (vectype, t); | |
1829 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL); | |
1830 | |
1831 vec_def = induc_def; | |
1832 prev_stmt_vinfo = vinfo_for_stmt (induction_phi); | |
1833 for (i = 1; i < ncopies; i++) | |
1834 { | |
1835 /* vec_i = vec_prev + vec_step */ | |
1836 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest, | |
1837 vec_def, vec_step); | |
1838 vec_def = make_ssa_name (vec_dest, new_stmt); | |
1839 gimple_assign_set_lhs (new_stmt, vec_def); | |
1840 | |
1841 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); | |
1842 set_vinfo_for_stmt (new_stmt, | |
1843 new_stmt_vec_info (new_stmt, loop_vinfo)); | |
1844 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; | |
1845 prev_stmt_vinfo = vinfo_for_stmt (new_stmt); | |
1846 } | |
1847 } | |
1848 | |
1849 if (nested_in_vect_loop) | |
1850 { | |
1851 /* Find the loop-closed exit-phi of the induction, and record | |
1852 the final vector of induction results: */ | |
1853 exit_phi = NULL; | |
1854 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) | |
1855 { | |
1856 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p)))) | |
1857 { | |
1858 exit_phi = USE_STMT (use_p); | |
1859 break; | |
1860 } | |
1861 } | |
1862 if (exit_phi) | |
1863 { | |
1864 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); | |
1865 /* FORNOW. Currently not supporting the case that an inner-loop induction | |
1866 is not used in the outer-loop (i.e. only outside the outer-loop). */ | |
1867 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) | |
1868 && !STMT_VINFO_LIVE_P (stmt_vinfo)); | |
1869 | |
1870 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt; | |
1871 if (vect_print_dump_info (REPORT_DETAILS)) | |
1872 { | |
1873 fprintf (vect_dump, "vector of inductions after inner-loop:"); | |
1874 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM); | |
1875 } | |
1876 } | |
1877 } | |
1878 | |
1879 | |
1880 if (vect_print_dump_info (REPORT_DETAILS)) | |
1881 { | |
1882 fprintf (vect_dump, "transform induction: created def-use cycle: "); | |
1883 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM); | |
1884 fprintf (vect_dump, "\n"); | |
1885 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM); | |
1886 } | |
1887 | |
1888 STMT_VINFO_VEC_STMT (phi_info) = induction_phi; | |
1889 return induc_def; | |
1890 } | |
1891 | |
1892 | |
1893 /* Function vect_get_vec_def_for_operand. | |
1894 | |
1895 OP is an operand in STMT. This function returns a (vector) def that will be | |
1896 used in the vectorized stmt for STMT. | |
1897 | |
1898 In the case that OP is an SSA_NAME which is defined in the loop, then | |
1899 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def. | |
1900 | |
1901 In case OP is an invariant or constant, a new stmt that creates a vector def | |
1902 needs to be introduced. */ | |
1903 | |
1904 static tree | |
1905 vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def) | |
1906 { | |
1907 tree vec_oprnd; | |
1908 gimple vec_stmt; | |
1909 gimple def_stmt; | |
1910 stmt_vec_info def_stmt_info = NULL; | |
1911 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); | |
1912 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); | |
1913 unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
1914 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); | |
1915 tree vec_inv; | |
1916 tree vec_cst; | |
1917 tree t = NULL_TREE; | |
1918 tree def; | |
1919 int i; | |
1920 enum vect_def_type dt; | |
1921 bool is_simple_use; | |
1922 tree vector_type; | |
1923 | |
1924 if (vect_print_dump_info (REPORT_DETAILS)) | |
1925 { | |
1926 fprintf (vect_dump, "vect_get_vec_def_for_operand: "); | |
1927 print_generic_expr (vect_dump, op, TDF_SLIM); | |
1928 } | |
1929 | |
1930 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); | |
1931 gcc_assert (is_simple_use); | |
1932 if (vect_print_dump_info (REPORT_DETAILS)) | |
1933 { | |
1934 if (def) | |
1935 { | |
1936 fprintf (vect_dump, "def = "); | |
1937 print_generic_expr (vect_dump, def, TDF_SLIM); | |
1938 } | |
1939 if (def_stmt) | |
1940 { | |
1941 fprintf (vect_dump, " def_stmt = "); | |
1942 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM); | |
1943 } | |
1944 } | |
1945 | |
1946 switch (dt) | |
1947 { | |
1948 /* Case 1: operand is a constant. */ | |
1949 case vect_constant_def: | |
1950 { | |
1951 if (scalar_def) | |
1952 *scalar_def = op; | |
1953 | |
1954 /* Create 'vect_cst_ = {cst,cst,...,cst}' */ | |
1955 if (vect_print_dump_info (REPORT_DETAILS)) | |
1956 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits); | |
1957 | |
1958 for (i = nunits - 1; i >= 0; --i) | |
1959 { | |
1960 t = tree_cons (NULL_TREE, op, t); | |
1961 } | |
1962 vec_cst = build_vector (vectype, t); | |
1963 return vect_init_vector (stmt, vec_cst, vectype, NULL); | |
1964 } | |
1965 | |
1966 /* Case 2: operand is defined outside the loop - loop invariant. */ | |
1967 case vect_invariant_def: | |
1968 { | |
1969 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def)); | |
1970 gcc_assert (vector_type); | |
1971 nunits = TYPE_VECTOR_SUBPARTS (vector_type); | |
1972 | |
1973 if (scalar_def) | |
1974 *scalar_def = def; | |
1975 | |
1976 /* Create 'vec_inv = {inv,inv,..,inv}' */ | |
1977 if (vect_print_dump_info (REPORT_DETAILS)) | |
1978 fprintf (vect_dump, "Create vector_inv."); | |
1979 | |
1980 for (i = nunits - 1; i >= 0; --i) | |
1981 { | |
1982 t = tree_cons (NULL_TREE, def, t); | |
1983 } | |
1984 | |
1985 /* FIXME: use build_constructor directly. */ | |
1986 vec_inv = build_constructor_from_list (vector_type, t); | |
1987 return vect_init_vector (stmt, vec_inv, vector_type, NULL); | |
1988 } | |
1989 | |
1990 /* Case 3: operand is defined inside the loop. */ | |
1991 case vect_loop_def: | |
1992 { | |
1993 if (scalar_def) | |
          *scalar_def = NULL;  /* FIXME tuples: def_stmt.  */
1995 | |
1996 /* Get the def from the vectorized stmt. */ | |
1997 def_stmt_info = vinfo_for_stmt (def_stmt); | |
1998 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info); | |
1999 gcc_assert (vec_stmt); | |
2000 if (gimple_code (vec_stmt) == GIMPLE_PHI) | |
2001 vec_oprnd = PHI_RESULT (vec_stmt); | |
2002 else if (is_gimple_call (vec_stmt)) | |
2003 vec_oprnd = gimple_call_lhs (vec_stmt); | |
2004 else | |
2005 vec_oprnd = gimple_assign_lhs (vec_stmt); | |
2006 return vec_oprnd; | |
2007 } | |
2008 | |
2009 /* Case 4: operand is defined by a loop header phi - reduction */ | |
2010 case vect_reduction_def: | |
2011 { | |
2012 struct loop *loop; | |
2013 | |
2014 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI); | |
2015 loop = (gimple_bb (def_stmt))->loop_father; | |
2016 | |
2017 /* Get the def before the loop */ | |
2018 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); | |
2019 return get_initial_def_for_reduction (stmt, op, scalar_def); | |
2020 } | |
2021 | |
2022 /* Case 5: operand is defined by loop-header phi - induction. */ | |
2023 case vect_induction_def: | |
2024 { | |
2025 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI); | |
2026 | |
2027 /* Get the def from the vectorized stmt. */ | |
2028 def_stmt_info = vinfo_for_stmt (def_stmt); | |
2029 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info); | |
2030 gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI); | |
2031 vec_oprnd = PHI_RESULT (vec_stmt); | |
2032 return vec_oprnd; | |
2033 } | |
2034 | |
2035 default: | |
2036 gcc_unreachable (); | |
2037 } | |
2038 } | |
2039 | |
2040 | |
2041 /* Function vect_get_vec_def_for_stmt_copy | |
2042 | |
2043 Return a vector-def for an operand. This function is used when the | |
2044 vectorized stmt to be created (by the caller to this function) is a "copy" | |
2045 created in case the vectorized result cannot fit in one vector, and several | |
2046 copies of the vector-stmt are required. In this case the vector-def is | |
2047 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field | |
2048 of the stmt that defines VEC_OPRND. | |
2049 DT is the type of the vector def VEC_OPRND. | |
2050 | |
2051 Context: | |
2052 In case the vectorization factor (VF) is bigger than the number | |
2053 of elements that can fit in a vectype (nunits), we have to generate | |
2054 more than one vector stmt to vectorize the scalar stmt. This situation | |
2055 arises when there are multiple data-types operated upon in the loop; the | |
2056 smallest data-type determines the VF, and as a result, when vectorizing | |
2057 stmts operating on wider types we need to create 'VF/nunits' "copies" of the | |
2058 vector stmt (each computing a vector of 'nunits' results, and together | |
2059 computing 'VF' results in each iteration). This function is called when | |
2060 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in | |
2061 which VF=16 and nunits=4, so the number of copies required is 4): | |
2062 | |
2063 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT | |
2064 | |
2065 S1: x = load VS1.0: vx.0 = memref0 VS1.1 | |
2066 VS1.1: vx.1 = memref1 VS1.2 | |
2067 VS1.2: vx.2 = memref2 VS1.3 | |
2068 VS1.3: vx.3 = memref3 | |
2069 | |
2070 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1 | |
2071 VSnew.1: vz1 = vx.1 + ... VSnew.2 | |
2072 VSnew.2: vz2 = vx.2 + ... VSnew.3 | |
2073 VSnew.3: vz3 = vx.3 + ... | |
2074 | |
2075 The vectorization of S1 is explained in vectorizable_load. | |
2076 The vectorization of S2: | |
2077 To create the first vector-stmt out of the 4 copies - VSnew.0 - | |
2078 the function 'vect_get_vec_def_for_operand' is called to | |
2079 get the relevant vector-def for each operand of S2. For operand x it | |
2080 returns the vector-def 'vx.0'. | |
2081 | |
2082 To create the remaining copies of the vector-stmt (VSnew.j), this | |
2083 function is called to get the relevant vector-def for each operand. It is | |
2084 obtained from the respective VS1.j stmt, which is recorded in the | |
2085 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND. | |
2086 | |
2087 For example, to obtain the vector-def 'vx.1' in order to create the | |
2088 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'. | |
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2090 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1', | |
2091 and return its def ('vx.1'). | |
2092 Overall, to create the above sequence this function will be called 3 times: | |
2093 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0); | |
2094 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1); | |
2095 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */ | |
2096 | |
2097 static tree | |
2098 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd) | |
2099 { | |
2100 gimple vec_stmt_for_operand; | |
2101 stmt_vec_info def_stmt_info; | |
2102 | |
2103 /* Do nothing; can reuse same def. */ | |
  if (dt == vect_invariant_def || dt == vect_constant_def)
2105 return vec_oprnd; | |
2106 | |
2107 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd); | |
2108 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand); | |
2109 gcc_assert (def_stmt_info); | |
2110 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info); | |
2111 gcc_assert (vec_stmt_for_operand); | |
  if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
    vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
  else
    vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2117 return vec_oprnd; | |
2118 } | |
2119 | |
2120 | |
2121 /* Get vectorized definitions for the operands to create a copy of an original | |
2122 stmt. See vect_get_vec_def_for_stmt_copy() for details. */ | |
2123 | |
2124 static void | |
2125 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt, | |
2126 VEC(tree,heap) **vec_oprnds0, | |
2127 VEC(tree,heap) **vec_oprnds1) | |
2128 { | |
2129 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0); | |
2130 | |
2131 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd); | |
2132 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd); | |
2133 | |
2134 if (vec_oprnds1 && *vec_oprnds1) | |
2135 { | |
2136 vec_oprnd = VEC_pop (tree, *vec_oprnds1); | |
2137 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd); | |
2138 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd); | |
2139 } | |
2140 } | |
2141 | |
2142 | |
/* Get vectorized definitions for OP0 and OP1, or from SLP_NODE if it is
   not NULL.  */
2144 | |
2145 static void | |
2146 vect_get_vec_defs (tree op0, tree op1, gimple stmt, | |
2147 VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1, | |
2148 slp_tree slp_node) | |
2149 { | |
2150 if (slp_node) | |
2151 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1); | |
2152 else | |
2153 { | |
2154 tree vec_oprnd; | |
2155 | |
2156 *vec_oprnds0 = VEC_alloc (tree, heap, 1); | |
2157 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL); | |
2158 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd); | |
2159 | |
2160 if (op1) | |
2161 { | |
2162 *vec_oprnds1 = VEC_alloc (tree, heap, 1); | |
2163 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL); | |
2164 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd); | |
2165 } | |
2166 } | |
2167 } | |
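
/* A sketch of the typical calling pattern for the two routines above,
   condensed from the loop in vectorizable_operation (error handling and
   the uses of the returned defs omitted):

     vec_oprnds0 = NULL;
     vec_oprnds1 = NULL;
     for (j = 0; j < ncopies; j++)
       {
         if (j == 0)
           vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
                              slp_node);
         else
           vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
         ...
       }  */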
2168 | |
2169 | |
2170 /* Function vect_finish_stmt_generation. | |
2171 | |
2172 Insert a new stmt. */ | |
2173 | |
2174 static void | |
2175 vect_finish_stmt_generation (gimple stmt, gimple vec_stmt, | |
2176 gimple_stmt_iterator *gsi) | |
2177 { | |
2178 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
2179 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
2180 | |
2181 gcc_assert (gimple_code (stmt) != GIMPLE_LABEL); | |
2182 | |
2183 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT); | |
2184 | |
2185 set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo)); | |
2186 | |
2187 if (vect_print_dump_info (REPORT_DETAILS)) | |
2188 { | |
2189 fprintf (vect_dump, "add new stmt: "); | |
2190 print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM); | |
2191 } | |
2192 | |
2193 gimple_set_location (vec_stmt, gimple_location (gsi_stmt (*gsi))); | |
2194 } | |
2195 | |
2196 | |
2197 /* Function get_initial_def_for_reduction | |
2198 | |
2199 Input: | |
2200 STMT - a stmt that performs a reduction operation in the loop. | |
2201 INIT_VAL - the initial value of the reduction variable | |
2202 | |
2203 Output: | |
2204 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result | |
2205 of the reduction (used for adjusting the epilog - see below). | |
2206 Return a vector variable, initialized according to the operation that STMT | |
2207 performs. This vector will be used as the initial value of the | |
2208 vector of partial results. | |
2209 | |
2210 Option1 (adjust in epilog): Initialize the vector as follows: | |
2211 add: [0,0,...,0,0] | |
2212 mult: [1,1,...,1,1] | |
2213 min/max: [init_val,init_val,..,init_val,init_val] | |
2214 bit and/or: [init_val,init_val,..,init_val,init_val] | |
2215 and when necessary (e.g. add/mult case) let the caller know | |
2216 that it needs to adjust the result by init_val. | |
2217 | |
2218 Option2: Initialize the vector as follows: | |
2219 add: [0,0,...,0,init_val] | |
2220 mult: [1,1,...,1,init_val] | |
2221 min/max: [init_val,init_val,...,init_val] | |
2222 bit and/or: [init_val,init_val,...,init_val] | |
2223 and no adjustments are needed. | |
2224 | |
2225 For example, for the following code: | |
2226 | |
2227 s = init_val; | |
2228 for (i=0;i<n;i++) | |
2229 s = s + a[i]; | |
2230 | |
2231 STMT is 's = s + a[i]', and the reduction variable is 's'. | |
2232 For a vector of 4 units, we want to return either [0,0,0,init_val], | |
2233 or [0,0,0,0] and let the caller know that it needs to adjust | |
2234 the result at the end by 'init_val'. | |
2235 | |
2236 FORNOW, we are using the 'adjust in epilog' scheme, because this way the | |
2237 initialization vector is simpler (same element in all entries). | |
2238 A cost model should help decide between these two schemes. */ | |
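
/* E.g. (values assumed for illustration): for the code above with
   init_val = 5 and four units, option1 returns [0, 0, 0, 0] and sets
   ADJUSTMENT_DEF to 5, whereas option2 would return [0, 0, 0, 5]
   directly.  */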
2239 | |
2240 static tree | |
2241 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def) | |
2242 { | |
2243 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); | |
2244 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); | |
2245 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
2246 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); | |
2247 int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
2248 tree scalar_type = TREE_TYPE (vectype); | |
2249 enum tree_code code = gimple_assign_rhs_code (stmt); | |
2250 tree type = TREE_TYPE (init_val); | |
2251 tree vecdef; | |
2252 tree def_for_init; | |
2253 tree init_def; | |
2254 tree t = NULL_TREE; | |
2255 int i; | |
2256 bool nested_in_vect_loop = false; | |
2257 | |
  gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
              || SCALAR_FLOAT_TYPE_P (type));
2259 if (nested_in_vect_loop_p (loop, stmt)) | |
2260 nested_in_vect_loop = true; | |
2261 else | |
2262 gcc_assert (loop == (gimple_bb (stmt))->loop_father); | |
2263 | |
2264 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL); | |
2265 | |
2266 switch (code) | |
2267 { | |
2268 case WIDEN_SUM_EXPR: | |
2269 case DOT_PROD_EXPR: | |
2270 case PLUS_EXPR: | |
2271 if (nested_in_vect_loop) | |
2272 *adjustment_def = vecdef; | |
2273 else | |
2274 *adjustment_def = init_val; | |
2275 /* Create a vector of zeros for init_def. */ | |
2276 if (SCALAR_FLOAT_TYPE_P (scalar_type)) | |
2277 def_for_init = build_real (scalar_type, dconst0); | |
2278 else | |
2279 def_for_init = build_int_cst (scalar_type, 0); | |
2280 | |
2281 for (i = nunits - 1; i >= 0; --i) | |
2282 t = tree_cons (NULL_TREE, def_for_init, t); | |
2283 init_def = build_vector (vectype, t); | |
2284 break; | |
2285 | |
2286 case MIN_EXPR: | |
2287 case MAX_EXPR: | |
2288 *adjustment_def = NULL_TREE; | |
2289 init_def = vecdef; | |
2290 break; | |
2291 | |
2292 default: | |
2293 gcc_unreachable (); | |
2294 } | |
2295 | |
2296 return init_def; | |
2297 } | |
2298 | |
2299 | |
2300 /* Function vect_create_epilog_for_reduction | |
2301 | |
2302 Create code at the loop-epilog to finalize the result of a reduction | |
2303 computation. | |
2304 | |
2305 VECT_DEF is a vector of partial results. | |
2306 REDUC_CODE is the tree-code for the epilog reduction. | |
2307 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the | |
2308 number of elements that we can fit in a vectype (nunits). In this case | |
2309 we have to generate more than one vector stmt - i.e - we need to "unroll" | |
2310 the vector stmt by a factor VF/nunits. For more details see documentation | |
2311 in vectorizable_operation. | |
2312 STMT is the scalar reduction stmt that is being vectorized. | |
2313 REDUCTION_PHI is the phi-node that carries the reduction computation. | |
2314 | |
2315 This function: | |
2316 1. Creates the reduction def-use cycle: sets the arguments for | |
2317 REDUCTION_PHI: | |
2318 The loop-entry argument is the vectorized initial-value of the reduction. | |
2319 The loop-latch argument is VECT_DEF - the vector of partial sums. | |
2320 2. "Reduces" the vector of partial results VECT_DEF into a single result, | |
2321 by applying the operation specified by REDUC_CODE if available, or by | |
2322 other means (whole-vector shifts or a scalar loop). | |
2323 The function also creates a new phi node at the loop exit to preserve | |
2324 loop-closed form, as illustrated below. | |
2325 | |
2326 The flow at the entry to this function: | |
2327 | |
2328 loop: | |
2329 vec_def = phi <null, null> # REDUCTION_PHI | |
2330 VECT_DEF = vector_stmt # vectorized form of STMT | |
2331 s_loop = scalar_stmt # (scalar) STMT | |
2332 loop_exit: | |
2333 s_out0 = phi <s_loop> # (scalar) EXIT_PHI | |
2334 use <s_out0> | |
2335 use <s_out0> | |
2336 | |
2337 The above is transformed by this function into: | |
2338 | |
2339 loop: | |
2340 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI | |
2341 VECT_DEF = vector_stmt # vectorized form of STMT | |
2342 s_loop = scalar_stmt # (scalar) STMT | |
2343 loop_exit: | |
2344 s_out0 = phi <s_loop> # (scalar) EXIT_PHI | |
2345 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI | |
2346 v_out2 = reduce <v_out1> | |
2347 s_out3 = extract_field <v_out2, 0> | |
2348 s_out4 = adjust_result <s_out3> | |
2349 use <s_out4> | |
2350 use <s_out4> | |
2351 */ | |
2352 | |
2353 static void | |
2354 vect_create_epilog_for_reduction (tree vect_def, gimple stmt, | |
2355 int ncopies, | |
2356 enum tree_code reduc_code, | |
2357 gimple reduction_phi) | |
2358 { | |
2359 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
2360 stmt_vec_info prev_phi_info; | |
2361 tree vectype; | |
2362 enum machine_mode mode; | |
2363 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
2364 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
2365 basic_block exit_bb; | |
2366 tree scalar_dest; | |
2367 tree scalar_type; | |
2368 gimple new_phi = NULL, phi; | |
2369 gimple_stmt_iterator exit_gsi; | |
2370 tree vec_dest; | |
2371 tree new_temp = NULL_TREE; | |
2372 tree new_name; | |
2373 gimple epilog_stmt = NULL; | |
2374 tree new_scalar_dest, new_dest; | |
2375 gimple exit_phi; | |
2376 tree bitsize, bitpos, bytesize; | |
2377 enum tree_code code = gimple_assign_rhs_code (stmt); | |
2378 tree adjustment_def; | |
2379 tree vec_initial_def, def; | |
2380 tree orig_name; | |
2381 imm_use_iterator imm_iter; | |
2382 use_operand_p use_p; | |
2383 bool extract_scalar_result = false; | |
2384 tree reduction_op, expr; | |
2385 gimple orig_stmt; | |
2386 gimple use_stmt; | |
2387 bool nested_in_vect_loop = false; | |
2388 VEC(gimple,heap) *phis = NULL; | |
2389 enum vect_def_type dt = vect_unknown_def_type; | |
2390 int j, i; | |
2391 | |
2392 if (nested_in_vect_loop_p (loop, stmt)) | |
2393 { | |
2394 loop = loop->inner; | |
2395 nested_in_vect_loop = true; | |
2396 } | |
2397 | |
2398 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) | |
2399 { | |
2400 case GIMPLE_SINGLE_RHS: | |
2401 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op); | |
2402 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2); | |
2403 break; | |
2404 case GIMPLE_UNARY_RHS: | |
2405 reduction_op = gimple_assign_rhs1 (stmt); | |
2406 break; | |
2407 case GIMPLE_BINARY_RHS: | |
2408 reduction_op = gimple_assign_rhs2 (stmt); | |
2409 break; | |
2410 default: | |
2411 gcc_unreachable (); | |
2412 } | |
2413 | |
2414 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); | |
2415 gcc_assert (vectype); | |
2416 mode = TYPE_MODE (vectype); | |
2417 | |
2418 /*** 1. Create the reduction def-use cycle ***/ | |
2419 | |
2420 /* For the case of reduction, vect_get_vec_def_for_operand returns | |
2421 the scalar def before the loop, that defines the initial value | |
2422 of the reduction variable. */ | |
2423 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt, | |
2424 &adjustment_def); | |
2425 | |
2426 phi = reduction_phi; | |
2427 def = vect_def; | |
2428 for (j = 0; j < ncopies; j++) | |
2429 { | |
2430 /* 1.1 set the loop-entry arg of the reduction-phi: */ | |
2431 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop)); | |
2432 | |
2433 /* 1.2 set the loop-latch arg for the reduction-phi: */ | |
2434 if (j > 0) | |
2435 def = vect_get_vec_def_for_stmt_copy (dt, def); | |
2436 add_phi_arg (phi, def, loop_latch_edge (loop)); | |
2437 | |
2438 if (vect_print_dump_info (REPORT_DETAILS)) | |
2439 { | |
2440 fprintf (vect_dump, "transform reduction: created def-use cycle: "); | |
2441 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); | |
2442 fprintf (vect_dump, "\n"); | |
2443 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM); | |
2444 } | |
2445 | |
2446 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); | |
2447 } | |
2448 | |
2449 /*** 2. Create epilog code | |
2450 The reduction epilog code operates across the elements of the vector | |
2451 of partial results computed by the vectorized loop. | |
2452 The reduction epilog code consists of: | |
2453 step 1: compute the scalar result in a vector (v_out2) | |
2454 step 2: extract the scalar result (s_out3) from the vector (v_out2) | |
2455 step 3: adjust the scalar result (s_out3) if needed. | |
2456 | |
        Step 1 can be accomplished using one of the following three schemes:
2458 (scheme 1) using reduc_code, if available. | |
2459 (scheme 2) using whole-vector shifts, if available. | |
2460 (scheme 3) using a scalar loop. In this case steps 1+2 above are | |
2461 combined. | |
2462 | |
2463 The overall epilog code looks like this: | |
2464 | |
2465 s_out0 = phi <s_loop> # original EXIT_PHI | |
2466 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI | |
2467 v_out2 = reduce <v_out1> # step 1 | |
2468 s_out3 = extract_field <v_out2, 0> # step 2 | |
2469 s_out4 = adjust_result <s_out3> # step 3 | |
2470 | |
2471 (step 3 is optional, and steps 1 and 2 may be combined). | |
2472 Lastly, the uses of s_out0 are replaced by s_out4. | |
2473 | |
2474 ***/ | |
2475 | |
2476 /* 2.1 Create new loop-exit-phi to preserve loop-closed form: | |
2477 v_out1 = phi <v_loop> */ | |
2478 | |
2479 exit_bb = single_exit (loop)->dest; | |
2480 def = vect_def; | |
2481 prev_phi_info = NULL; | |
2482 for (j = 0; j < ncopies; j++) | |
2483 { | |
2484 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb); | |
2485 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo)); | |
2486 if (j == 0) | |
2487 new_phi = phi; | |
2488 else | |
2489 { | |
2490 def = vect_get_vec_def_for_stmt_copy (dt, def); | |
2491 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi; | |
2492 } | |
2493 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); | |
2494 prev_phi_info = vinfo_for_stmt (phi); | |
2495 } | |
2496 exit_gsi = gsi_after_labels (exit_bb); | |
2497 | |
2498 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 | |
2499 (i.e. when reduc_code is not available) and in the final adjustment | |
2500 code (if needed). Also get the original scalar reduction variable as | |
2501 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it | |
2502 represents a reduction pattern), the tree-code and scalar-def are | |
2503 taken from the original stmt that the pattern-stmt (STMT) replaces. | |
2504 Otherwise (it is a regular reduction) - the tree-code and scalar-def | |
2505 are taken from STMT. */ | |
2506 | |
2507 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); | |
2508 if (!orig_stmt) | |
2509 { | |
2510 /* Regular reduction */ | |
2511 orig_stmt = stmt; | |
2512 } | |
2513 else | |
2514 { | |
2515 /* Reduction pattern */ | |
2516 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt); | |
2517 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)); | |
2518 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); | |
2519 } | |
2520 code = gimple_assign_rhs_code (orig_stmt); | |
2521 scalar_dest = gimple_assign_lhs (orig_stmt); | |
2522 scalar_type = TREE_TYPE (scalar_dest); | |
2523 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); | |
2524 bitsize = TYPE_SIZE (scalar_type); | |
2525 bytesize = TYPE_SIZE_UNIT (scalar_type); | |
2526 | |
2527 | |
2528 /* In case this is a reduction in an inner-loop while vectorizing an outer | |
2529 loop - we don't need to extract a single scalar result at the end of the | |
2530 inner-loop. The final vector of partial results will be used in the | |
2531 vectorized outer-loop, or reduced to a scalar result at the end of the | |
2532 outer-loop. */ | |
2533 if (nested_in_vect_loop) | |
2534 goto vect_finalize_reduction; | |
2535 | |
2536 /* FORNOW */ | |
2537 gcc_assert (ncopies == 1); | |
2538 | |
2539 /* 2.3 Create the reduction code, using one of the three schemes described | |
2540 above. */ | |
2541 | |
2542 if (reduc_code < NUM_TREE_CODES) | |
2543 { | |
2544 tree tmp; | |
2545 | |
2546 /*** Case 1: Create: | |
2547 v_out2 = reduc_expr <v_out1> */ | |
2548 | |
2549 if (vect_print_dump_info (REPORT_DETAILS)) | |
2550 fprintf (vect_dump, "Reduce using direct vector reduction."); | |
2551 | |
2552 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
2553 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi)); | |
2554 epilog_stmt = gimple_build_assign (vec_dest, tmp); | |
2555 new_temp = make_ssa_name (vec_dest, epilog_stmt); | |
2556 gimple_assign_set_lhs (epilog_stmt, new_temp); | |
2557 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
2558 | |
2559 extract_scalar_result = true; | |
2560 } | |
2561 else | |
2562 { | |
      enum tree_code shift_code = ERROR_MARK;
2564 bool have_whole_vector_shift = true; | |
2565 int bit_offset; | |
2566 int element_bitsize = tree_low_cst (bitsize, 1); | |
2567 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); | |
2568 tree vec_temp; | |
2569 | |
2570 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) | |
2571 shift_code = VEC_RSHIFT_EXPR; | |
2572 else | |
2573 have_whole_vector_shift = false; | |
2574 | |
2575 /* Regardless of whether we have a whole vector shift, if we're | |
2576 emulating the operation via tree-vect-generic, we don't want | |
2577 to use it. Only the first round of the reduction is likely | |
2578 to still be profitable via emulation. */ | |
2579 /* ??? It might be better to emit a reduction tree code here, so that | |
2580 tree-vect-generic can expand the first round via bit tricks. */ | |
2581 if (!VECTOR_MODE_P (mode)) | |
2582 have_whole_vector_shift = false; | |
2583 else | |
2584 { | |
2585 optab optab = optab_for_tree_code (code, vectype, optab_default); | |
2586 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) | |
2587 have_whole_vector_shift = false; | |
2588 } | |
2589 | |
2590 if (have_whole_vector_shift) | |
2591 { | |
2592 /*** Case 2: Create: | |
2593 for (offset = VS/2; offset >= element_size; offset/=2) | |
2594 { | |
2595 Create: va' = vec_shift <va, offset> | |
2596 Create: va = vop <va, va'> | |
2597 } */ | |
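
          /* Schematic trace (little-endian target and V4SI assumed, so
             vec_size_in_bits = 128 and element_bitsize = 32), starting
             from va = {a, b, c, d}:
               offset 64:  va' = {c, d, _, _};    va = {a+c, b+d, _, _}
               offset 32:  va' = {b+d, _, _, _};  va = {a+b+c+d, _, _, _}
             after which step 2.4 below extracts element 0.  */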
2598 | |
2599 if (vect_print_dump_info (REPORT_DETAILS)) | |
2600 fprintf (vect_dump, "Reduce using vector shifts"); | |
2601 | |
2602 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
2603 new_temp = PHI_RESULT (new_phi); | |
2604 | |
2605 for (bit_offset = vec_size_in_bits/2; | |
2606 bit_offset >= element_bitsize; | |
2607 bit_offset /= 2) | |
2608 { | |
2609 tree bitpos = size_int (bit_offset); | |
2610 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest, | |
2611 new_temp, bitpos); | |
2612 new_name = make_ssa_name (vec_dest, epilog_stmt); | |
2613 gimple_assign_set_lhs (epilog_stmt, new_name); | |
2614 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
2615 | |
2616 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest, | |
2617 new_name, new_temp); | |
2618 new_temp = make_ssa_name (vec_dest, epilog_stmt); | |
2619 gimple_assign_set_lhs (epilog_stmt, new_temp); | |
2620 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
2621 } | |
2622 | |
2623 extract_scalar_result = true; | |
2624 } | |
2625 else | |
2626 { | |
2627 tree rhs; | |
2628 | |
2629 /*** Case 3: Create: | |
2630 s = extract_field <v_out2, 0> | |
2631 for (offset = element_size; | |
2632 offset < vector_size; | |
2633 offset += element_size;) | |
2634 { | |
2635 Create: s' = extract_field <v_out2, offset> | |
2636 Create: s = op <s, s'> | |
2637 } */ | |
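
          /* Schematically, for {a, b, c, d} this emits:
               s = a;  s = op (s, b);  s = op (s, c);  s = op (s, d);
             so steps 1 and 2 are combined and no separate extraction is
             needed.  */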
2638 | |
2639 if (vect_print_dump_info (REPORT_DETAILS)) | |
2640 fprintf (vect_dump, "Reduce using scalar code. "); | |
2641 | |
2642 vec_temp = PHI_RESULT (new_phi); | |
2643 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); | |
2644 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, | |
2645 bitsize_zero_node); | |
2646 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); | |
2647 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); | |
2648 gimple_assign_set_lhs (epilog_stmt, new_temp); | |
2649 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
2650 | |
2651 for (bit_offset = element_bitsize; | |
2652 bit_offset < vec_size_in_bits; | |
2653 bit_offset += element_bitsize) | |
2654 { | |
2655 tree bitpos = bitsize_int (bit_offset); | |
2656 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, | |
2657 bitpos); | |
2658 | |
2659 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); | |
2660 new_name = make_ssa_name (new_scalar_dest, epilog_stmt); | |
2661 gimple_assign_set_lhs (epilog_stmt, new_name); | |
2662 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
2663 | |
2664 epilog_stmt = gimple_build_assign_with_ops (code, | |
2665 new_scalar_dest, | |
2666 new_name, new_temp); | |
2667 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); | |
2668 gimple_assign_set_lhs (epilog_stmt, new_temp); | |
2669 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
2670 } | |
2671 | |
2672 extract_scalar_result = false; | |
2673 } | |
2674 } | |
2675 | |
2676 /* 2.4 Extract the final scalar result. Create: | |
2677 s_out3 = extract_field <v_out2, bitpos> */ | |
2678 | |
2679 if (extract_scalar_result) | |
2680 { | |
2681 tree rhs; | |
2682 | |
2683 gcc_assert (!nested_in_vect_loop); | |
2684 if (vect_print_dump_info (REPORT_DETAILS)) | |
2685 fprintf (vect_dump, "extract scalar result"); | |
2686 | |
2687 if (BYTES_BIG_ENDIAN) | |
2688 bitpos = size_binop (MULT_EXPR, | |
2689 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), | |
2690 TYPE_SIZE (scalar_type)); | |
2691 else | |
2692 bitpos = bitsize_zero_node; | |
2693 | |
2694 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos); | |
2695 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); | |
2696 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); | |
2697 gimple_assign_set_lhs (epilog_stmt, new_temp); | |
2698 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
2699 } | |
2700 | |
2701 vect_finalize_reduction: | |
2702 | |
2703 /* 2.5 Adjust the final result by the initial value of the reduction | |
2704 variable. (When such adjustment is not needed, then | |
2705 'adjustment_def' is zero). For example, if code is PLUS we create: | |
2706 new_temp = loop_exit_def + adjustment_def */ | |
2707 | |
2708 if (adjustment_def) | |
2709 { | |
2710 if (nested_in_vect_loop) | |
2711 { | |
2712 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); | |
2713 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); | |
2714 new_dest = vect_create_destination_var (scalar_dest, vectype); | |
2715 } | |
2716 else | |
2717 { | |
2718 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); | |
2719 expr = build2 (code, scalar_type, new_temp, adjustment_def); | |
2720 new_dest = vect_create_destination_var (scalar_dest, scalar_type); | |
2721 } | |
2722 epilog_stmt = gimple_build_assign (new_dest, expr); | |
2723 new_temp = make_ssa_name (new_dest, epilog_stmt); | |
2724 gimple_assign_set_lhs (epilog_stmt, new_temp); | |
2725 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt; | |
2726 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
2727 } | |
2728 | |
2729 | |
2730 /* 2.6 Handle the loop-exit phi */ | |
2731 | |
2732 /* Replace uses of s_out0 with uses of s_out3: | |
2733 Find the loop-closed-use at the loop exit of the original scalar result. | |
2734 (The reduction result is expected to have two immediate uses - one at the | |
2735 latch block, and one at the loop exit). */ | |
2736 phis = VEC_alloc (gimple, heap, 10); | |
2737 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) | |
2738 { | |
2739 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) | |
2740 { | |
2741 exit_phi = USE_STMT (use_p); | |
2742 VEC_quick_push (gimple, phis, exit_phi); | |
2743 } | |
2744 } | |
2745 /* We expect to have found an exit_phi because of loop-closed-ssa form. */ | |
2746 gcc_assert (!VEC_empty (gimple, phis)); | |
2747 | |
2748 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++) | |
2749 { | |
2750 if (nested_in_vect_loop) | |
2751 { | |
2752 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); | |
2753 | |
2754 /* FORNOW. Currently not supporting the case that an inner-loop | |
2755 reduction is not used in the outer-loop (but only outside the | |
2756 outer-loop). */ | |
2757 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) | |
2758 && !STMT_VINFO_LIVE_P (stmt_vinfo)); | |
2759 | |
2760 epilog_stmt = adjustment_def ? epilog_stmt : new_phi; | |
2761 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt; | |
2762 set_vinfo_for_stmt (epilog_stmt, | |
2763 new_stmt_vec_info (epilog_stmt, loop_vinfo)); | |
2764 if (adjustment_def) | |
2765 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) = | |
2766 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi)); | |
2767 continue; | |
2768 } | |
2769 | |
2770 /* Replace the uses: */ | |
2771 orig_name = PHI_RESULT (exit_phi); | |
2772 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) | |
2773 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) | |
2774 SET_USE (use_p, new_temp); | |
2775 } | |
2776 VEC_free (gimple, heap, phis); | |
2777 } | |
2778 | |
2779 | |
2780 /* Function vectorizable_reduction. | |
2781 | |
2782 Check if STMT performs a reduction operation that can be vectorized. | |
2783 If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
2784 stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
2785 Return FALSE if not a vectorizable STMT, TRUE otherwise. | |
2786 | |
2787 This function also handles reduction idioms (patterns) that have been | |
2788 recognized in advance during vect_pattern_recog. In this case, STMT may be | |
2789 of this form: | |
2790 X = pattern_expr (arg0, arg1, ..., X) | |
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2792 sequence that had been detected and replaced by the pattern-stmt (STMT). | |
2793 | |
2794 In some cases of reduction patterns, the type of the reduction variable X is | |
2795 different than the type of the other arguments of STMT. | |
2796 In such cases, the vectype that is used when transforming STMT into a vector | |
2797 stmt is different than the vectype that is used to determine the | |
2798 vectorization factor, because it consists of a different number of elements | |
2799 than the actual number of elements that are being operated upon in parallel. | |
2800 | |
2801 For example, consider an accumulation of shorts into an int accumulator. | |
2802 On some targets it's possible to vectorize this pattern operating on 8 | |
2803 shorts at a time (hence, the vectype for purposes of determining the | |
2804 vectorization factor should be V8HI); on the other hand, the vectype that | |
2805 is used to create the vector form is actually V4SI (the type of the result). | |
2806 | |
2807 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that | |
2808 indicates what is the actual level of parallelism (V8HI in the example), so | |
2809 that the right vectorization factor would be derived. This vectype | |
2810 corresponds to the type of arguments to the reduction stmt, and should *NOT* | |
2811 be used to create the vectorized stmt. The right vectype for the vectorized | |
2812 stmt is obtained from the type of the result X: | |
2813 get_vectype_for_scalar_type (TREE_TYPE (X)) | |
2814 | |
2815 This means that, contrary to "regular" reductions (or "regular" stmts in | |
2816 general), the following equation: | |
2817 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) | |
2818 does *NOT* necessarily hold for reduction patterns. */ | |
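
/* For instance, a loop like the following (types chosen for illustration)
   is recognized by vect_recog_widen_sum_pattern as such a pattern:

     short a[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += (int) a[i];

   The original widening-add sequence is replaced by the pattern-stmt
   'sum_1 = WIDEN_SUM_EXPR <a[i], sum_0>'; STMT_VINFO_VECTYPE is V8HI
   (eight shorts determine the VF) while the vectorized stmt produces
   V4SI partial sums.  */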
2819 | |
2820 bool | |
2821 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, | |
2822 gimple *vec_stmt) | |
2823 { | |
2824 tree vec_dest; | |
2825 tree scalar_dest; | |
2826 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE; | |
2827 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
2828 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
2829 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
2830 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
  enum tree_code code, orig_code, epilog_reduc_code = ERROR_MARK;
2832 enum machine_mode vec_mode; | |
2833 int op_type; | |
2834 optab optab, reduc_optab; | |
2835 tree new_temp = NULL_TREE; | |
2836 tree def; | |
2837 gimple def_stmt; | |
2838 enum vect_def_type dt; | |
2839 gimple new_phi = NULL; | |
2840 tree scalar_type; | |
2841 bool is_simple_use; | |
2842 gimple orig_stmt; | |
2843 stmt_vec_info orig_stmt_info; | |
2844 tree expr = NULL_TREE; | |
2845 int i; | |
2846 int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
2847 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
2848 int epilog_copies; | |
2849 stmt_vec_info prev_stmt_info, prev_phi_info; | |
2850 gimple first_phi = NULL; | |
2851 bool single_defuse_cycle = false; | |
2852 tree reduc_def; | |
2853 gimple new_stmt = NULL; | |
2854 int j; | |
2855 tree ops[3]; | |
2856 | |
2857 if (nested_in_vect_loop_p (loop, stmt)) | |
2858 loop = loop->inner; | |
2859 | |
2860 gcc_assert (ncopies >= 1); | |
2861 | |
2862 /* FORNOW: SLP not supported. */ | |
2863 if (STMT_SLP_TYPE (stmt_info)) | |
2864 return false; | |
2865 | |
2866 /* 1. Is vectorizable reduction? */ | |
2867 | |
2868 /* Not supportable if the reduction variable is used in the loop. */ | |
2869 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer) | |
2870 return false; | |
2871 | |
  /* Reductions that are not used even in an enclosing outer-loop
     are expected to be "live" (used out of the loop).  */
2874 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop | |
2875 && !STMT_VINFO_LIVE_P (stmt_info)) | |
2876 return false; | |
2877 | |
2878 /* Make sure it was already recognized as a reduction computation. */ | |
2879 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def) | |
2880 return false; | |
2881 | |
2882 /* 2. Has this been recognized as a reduction pattern? | |
2883 | |
2884 Check if STMT represents a pattern that has been recognized | |
2885 in earlier analysis stages. For stmts that represent a pattern, | |
2886 the STMT_VINFO_RELATED_STMT field records the last stmt in | |
2887 the original sequence that constitutes the pattern. */ | |
2888 | |
2889 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); | |
2890 if (orig_stmt) | |
2891 { | |
2892 orig_stmt_info = vinfo_for_stmt (orig_stmt); | |
2893 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt); | |
2894 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); | |
2895 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); | |
2896 } | |
2897 | |
2898 /* 3. Check the operands of the operation. The first operands are defined | |
2899 inside the loop body. The last operand is the reduction variable, | |
2900 which is defined by the loop-header-phi. */ | |
2901 | |
2902 gcc_assert (is_gimple_assign (stmt)); | |
2903 | |
2904 /* Flatten RHS */ | |
2905 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) | |
2906 { | |
2907 case GIMPLE_SINGLE_RHS: | |
2908 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)); | |
2909 if (op_type == ternary_op) | |
2910 { | |
2911 tree rhs = gimple_assign_rhs1 (stmt); | |
2912 ops[0] = TREE_OPERAND (rhs, 0); | |
2913 ops[1] = TREE_OPERAND (rhs, 1); | |
2914 ops[2] = TREE_OPERAND (rhs, 2); | |
2915 code = TREE_CODE (rhs); | |
2916 } | |
2917 else | |
2918 return false; | |
2919 break; | |
2920 | |
2921 case GIMPLE_BINARY_RHS: | |
2922 code = gimple_assign_rhs_code (stmt); | |
2923 op_type = TREE_CODE_LENGTH (code); | |
2924 gcc_assert (op_type == binary_op); | |
2925 ops[0] = gimple_assign_rhs1 (stmt); | |
2926 ops[1] = gimple_assign_rhs2 (stmt); | |
2927 break; | |
2928 | |
2929 case GIMPLE_UNARY_RHS: | |
2930 return false; | |
2931 | |
2932 default: | |
2933 gcc_unreachable (); | |
2934 } | |
2935 | |
2936 scalar_dest = gimple_assign_lhs (stmt); | |
2937 scalar_type = TREE_TYPE (scalar_dest); | |
2938 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) | |
2939 && !SCALAR_FLOAT_TYPE_P (scalar_type)) | |
2940 return false; | |
2941 | |
2942 /* All uses but the last are expected to be defined in the loop. | |
2943 The last use is the reduction variable. */ | |
2944 for (i = 0; i < op_type-1; i++) | |
2945 { | |
2946 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, | |
2947 &def, &dt); | |
2948 gcc_assert (is_simple_use); | |
2949 if (dt != vect_loop_def | |
2950 && dt != vect_invariant_def | |
2951 && dt != vect_constant_def | |
2952 && dt != vect_induction_def) | |
2953 return false; | |
2954 } | |
2955 | |
2956 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def, &dt); | |
2957 gcc_assert (is_simple_use); | |
2958 gcc_assert (dt == vect_reduction_def); | |
2959 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI); | |
2960 if (orig_stmt) | |
2961 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt)); | |
2962 else | |
2963 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt)); | |
2964 | |
2965 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt))) | |
2966 return false; | |
2967 | |
2968 /* 4. Supportable by target? */ | |
2969 | |
2970 /* 4.1. check support for the operation in the loop */ | |
2971 optab = optab_for_tree_code (code, vectype, optab_default); | |
2972 if (!optab) | |
2973 { | |
2974 if (vect_print_dump_info (REPORT_DETAILS)) | |
2975 fprintf (vect_dump, "no optab."); | |
2976 return false; | |
2977 } | |
2978 vec_mode = TYPE_MODE (vectype); | |
2979 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing) | |
2980 { | |
2981 if (vect_print_dump_info (REPORT_DETAILS)) | |
2982 fprintf (vect_dump, "op not supported by target."); | |
2983 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD | |
2984 || LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
2985 < vect_min_worthwhile_factor (code)) | |
2986 return false; | |
2987 if (vect_print_dump_info (REPORT_DETAILS)) | |
2988 fprintf (vect_dump, "proceeding using word mode."); | |
2989 } | |
2990 | |
2991 /* Worthwhile without SIMD support? */ | |
2992 if (!VECTOR_MODE_P (TYPE_MODE (vectype)) | |
2993 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
2994 < vect_min_worthwhile_factor (code)) | |
2995 { | |
2996 if (vect_print_dump_info (REPORT_DETAILS)) | |
2997 fprintf (vect_dump, "not worthwhile without SIMD support."); | |
2998 return false; | |
2999 } | |
3000 | |
3001 /* 4.2. Check support for the epilog operation. | |
3002 | |
3003 If STMT represents a reduction pattern, then the type of the | |
3004 reduction variable may be different than the type of the rest | |
3005 of the arguments. For example, consider the case of accumulation | |
3006 of shorts into an int accumulator. The original code: | |
3007 S1: int_a = (int) short_a; | |
3008 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; | |
3009 | |
3010 was replaced with: | |
3011 STMT: int_acc = widen_sum <short_a, int_acc> | |
3012 | |
3013 This means that: | |
3014 1. The tree-code that is used to create the vector operation in the | |
3015 epilog code (that reduces the partial results) is not the | |
3016 tree-code of STMT, but is rather the tree-code of the original | |
3017 stmt from the pattern that STMT is replacing. I.e., in the example | |
3018 above we want to use 'widen_sum' in the loop, but 'plus' in the | |
3019 epilog. | |
3020 2. The type (mode) we use to check available target support | |
3021 for the vector operation to be created in the *epilog*, is | |
3022 determined by the type of the reduction variable (in the example | |
3023 above we'd check this: plus_optab[vect_int_mode]). | |
3024 However the type (mode) we use to check available target support | |
3025 for the vector operation to be created *inside the loop*, is | |
3026 determined by the type of the other arguments to STMT (in the | |
3027 example we'd check this: widen_sum_optab[vect_short_mode]). | |
3028 | |
3029 This is contrary to "regular" reductions, in which the types of all | |
3030 the arguments are the same as the type of the reduction variable. | |
3031 For "regular" reductions we can therefore use the same vector type | |
3032 (and also the same tree-code) when generating the epilog code and | |
3033 when generating the code inside the loop. */ | |
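/* An editor's sketch of the two checks just described (the modes here are
   illustrative assumptions: V8HI for shorts, V4SI for ints):

       inside the loop:  widen_sum -> query widen_ssum_optab in V8HImode
       in the epilog:    plus      -> query reduc_splus_optab in V4SImode

   This is exactly the split implemented below: the in-loop check above
   used the vectype of the arguments, while the epilog check recomputes
   VECTYPE from the type of the reduction variable DEF.  */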
3034 | |
3035 if (orig_stmt) | |
3036 { | |
3037 /* This is a reduction pattern: get the vectype from the type of the | |
3038 reduction variable, and get the tree-code from orig_stmt. */ | |
3039 orig_code = gimple_assign_rhs_code (orig_stmt); | |
3040 vectype = get_vectype_for_scalar_type (TREE_TYPE (def)); | |
3041 if (!vectype) | |
3042 { | |
3043 if (vect_print_dump_info (REPORT_DETAILS)) | |
3044 { | |
3045 fprintf (vect_dump, "unsupported data-type "); | |
3046 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM); | |
3047 } | |
3048 return false; | |
3049 } | |
3050 | |
3051 vec_mode = TYPE_MODE (vectype); | |
3052 } | |
3053 else | |
3054 { | |
3055 /* Regular reduction: the same vectype and tree-code that are used for | |
3056 the vector code inside the loop can also be used for the epilog code. */ | |
3057 orig_code = code; | |
3058 } | |
3059 | |
3060 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code)) | |
3061 return false; | |
3062 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default); | |
3063 if (!reduc_optab) | |
3064 { | |
3065 if (vect_print_dump_info (REPORT_DETAILS)) | |
3066 fprintf (vect_dump, "no optab for reduction."); | |
3067 epilog_reduc_code = NUM_TREE_CODES; | |
3068 } | |
3069 if (reduc_optab && optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing) | |
3070 { | |
3071 if (vect_print_dump_info (REPORT_DETAILS)) | |
3072 fprintf (vect_dump, "reduc op not supported by target."); | |
3073 epilog_reduc_code = NUM_TREE_CODES; | |
3074 } | |
3075 | |
3076 if (!vec_stmt) /* transformation not required. */ | |
3077 { | |
3078 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; | |
3079 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies)) | |
3080 return false; | |
3081 return true; | |
3082 } | |
3083 | |
3084 /** Transform. **/ | |
3085 | |
3086 if (vect_print_dump_info (REPORT_DETAILS)) | |
3087 fprintf (vect_dump, "transform reduction."); | |
3088 | |
3089 /* Create the destination vector */ | |
3090 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
3091 | |
3092 /* In case the vectorization factor (VF) is bigger than the number | |
3093 of elements that we can fit in a vectype (nunits), we have to generate | |
3094 more than one vector stmt - i.e - we need to "unroll" the | |
3095 vector stmt by a factor VF/nunits. For more details see documentation | |
3096 in vectorizable_operation. */ | |
3097 | |
3098 /* If the reduction is used in an outer loop we need to generate | |
3099 VF intermediate results, like so (e.g. for ncopies=2): | |
3100 r0 = phi (init, r0) | |
3101 r1 = phi (init, r1) | |
3102 r0 = x0 + r0; | |
3103 r1 = x1 + r1; | |
3104 (i.e. we generate VF results in 2 registers). | |
3105 In this case we have a separate def-use cycle for each copy, and therefore | |
3106 for each copy we get the vector def for the reduction variable from the | |
3107 respective phi node created for this copy. | |
3108 | |
3109 Otherwise (the reduction is unused in the loop nest), we can combine | |
3110 together intermediate results, like so (e.g. for ncopies=2): | |
3111 r = phi (init, r) | |
3112 r = x0 + r; | |
3113 r = x1 + r; | |
3114 (i.e. we generate VF/2 results in a single register). | |
3115 In this case for each copy we get the vector def for the reduction variable | |
3116 from the vectorized reduction operation generated in the previous iteration. | |
3117 */ | |
3118 | |
3119 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop) | |
3120 { | |
3121 single_defuse_cycle = true; | |
3122 epilog_copies = 1; | |
3123 } | |
3124 else | |
3125 epilog_copies = ncopies; | |
3126 | |
3127 prev_stmt_info = NULL; | |
3128 prev_phi_info = NULL; | |
3129 for (j = 0; j < ncopies; j++) | |
3130 { | |
3131 if (j == 0 || !single_defuse_cycle) | |
3132 { | |
3133 /* Create the reduction-phi that defines the reduction-operand. */ | |
3134 new_phi = create_phi_node (vec_dest, loop->header); | |
3135 set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo)); | |
3136 } | |
3137 | |
3138 /* Handle uses. */ | |
3139 if (j == 0) | |
3140 { | |
3141 loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL); | |
3142 if (op_type == ternary_op) | |
3143 { | |
3144 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL); | |
3145 } | |
3146 | |
3147 /* Get the vector def for the reduction variable from the phi node */ | |
3148 reduc_def = PHI_RESULT (new_phi); | |
3149 first_phi = new_phi; | |
3150 } | |
3151 else | |
3152 { | |
3153 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */ | |
3154 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0); | |
3155 if (op_type == ternary_op) | |
3156 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1); | |
3157 | |
3158 if (single_defuse_cycle) | |
3159 reduc_def = gimple_assign_lhs (new_stmt); | |
3160 else | |
3161 reduc_def = PHI_RESULT (new_phi); | |
3162 | |
3163 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; | |
3164 } | |
3165 | |
3166 /* Arguments are ready. Create the new vector stmt. */ | |
3167 if (op_type == binary_op) | |
3168 expr = build2 (code, vectype, loop_vec_def0, reduc_def); | |
3169 else | |
3170 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1, | |
3171 reduc_def); | |
3172 new_stmt = gimple_build_assign (vec_dest, expr); | |
3173 new_temp = make_ssa_name (vec_dest, new_stmt); | |
3174 gimple_assign_set_lhs (new_stmt, new_temp); | |
3175 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
3176 | |
3177 if (j == 0) | |
3178 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
3179 else | |
3180 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3181 prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3182 prev_phi_info = vinfo_for_stmt (new_phi); | |
3183 } | |
3184 | |
3185 /* Finalize the reduction-phi (set its arguments) and create the | |
3186 epilog reduction code. */ | |
3187 if (!single_defuse_cycle) | |
3188 new_temp = gimple_assign_lhs (*vec_stmt); | |
3189 vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies, | |
3190 epilog_reduc_code, first_phi); | |
3191 return true; | |
3192 } | |
3193 | |
3194 /* Checks if CALL can be vectorized in type VECTYPE. Returns | |
3195 a function declaration if the target has a vectorized version | |
3196 of the function, or NULL_TREE if the function cannot be vectorized. */ | |
3197 | |
3198 tree | |
3199 vectorizable_function (gimple call, tree vectype_out, tree vectype_in) | |
3200 { | |
3201 tree fndecl = gimple_call_fndecl (call); | |
3202 enum built_in_function code; | |
3203 | |
3204 /* We only handle functions that do not read or clobber memory -- i.e. | |
3205 const or novops ones. */ | |
3206 if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS))) | |
3207 return NULL_TREE; | |
3208 | |
3209 if (!fndecl | |
3210 || TREE_CODE (fndecl) != FUNCTION_DECL | |
3211 || !DECL_BUILT_IN (fndecl)) | |
3212 return NULL_TREE; | |
3213 | |
3214 code = DECL_FUNCTION_CODE (fndecl); | |
3215 return targetm.vectorize.builtin_vectorized_function (code, vectype_out, | |
3216 vectype_in); | |
3217 } | |
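/* Illustrative usage (editor's note; the target mapping is an assumption):
   on a target whose builtin_vectorized_function hook maps BUILT_IN_SQRTF
   to a V4SF square-root builtin, a call

       y = sqrtf (x);

   with vectype_out == vectype_in == vector(4) float yields that builtin's
   fndecl; on targets without such a builtin, NULL_TREE is returned.  */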
3218 | |
3219 /* Function vectorizable_call. | |
3220 | |
3221 Check if STMT performs a function call that can be vectorized. | |
3222 If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
3223 stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
3224 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
3225 | |
3226 bool | |
3227 vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt) | |
3228 { | |
3229 tree vec_dest; | |
3230 tree scalar_dest; | |
3231 tree op, type; | |
3232 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE; | |
3233 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info; | |
3234 tree vectype_out, vectype_in; | |
3235 int nunits_in; | |
3236 int nunits_out; | |
3237 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
3238 tree fndecl, new_temp, def, rhs_type, lhs_type; | |
3239 gimple def_stmt; | |
3240 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; | |
3241 gimple new_stmt; | |
3242 int ncopies, j; | |
3243 VEC(tree, heap) *vargs = NULL; | |
3244 enum { NARROW, NONE, WIDEN } modifier; | |
3245 size_t i, nargs; | |
3246 | |
3247 if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
3248 return false; | |
3249 | |
3250 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
3251 return false; | |
3252 | |
3253 /* FORNOW: SLP not supported. */ | |
3254 if (STMT_SLP_TYPE (stmt_info)) | |
3255 return false; | |
3256 | |
3257 /* Is STMT a vectorizable call? */ | |
3258 if (!is_gimple_call (stmt)) | |
3259 return false; | |
3260 | |
3261 if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME) | |
3262 return false; | |
3263 | |
3264 /* Process function arguments. */ | |
3265 rhs_type = NULL_TREE; | |
3266 nargs = gimple_call_num_args (stmt); | |
3267 | |
3268 /* Bail out if the function has more than two arguments; we do | |
3269 not have interesting builtin functions to vectorize with more | |
3270 than two arguments. A call with no arguments is not handled either. */ | |
3271 if (nargs == 0 || nargs > 2) | |
3272 return false; | |
3273 | |
3274 for (i = 0; i < nargs; i++) | |
3275 { | |
3276 op = gimple_call_arg (stmt, i); | |
3277 | |
3278 /* We can only handle calls with arguments of the same type. */ | |
3279 if (rhs_type | |
3280 && rhs_type != TREE_TYPE (op)) | |
3281 { | |
3282 if (vect_print_dump_info (REPORT_DETAILS)) | |
3283 fprintf (vect_dump, "argument types differ."); | |
3284 return false; | |
3285 } | |
3286 rhs_type = TREE_TYPE (op); | |
3287 | |
3288 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i])) | |
3289 { | |
3290 if (vect_print_dump_info (REPORT_DETAILS)) | |
3291 fprintf (vect_dump, "use not simple."); | |
3292 return false; | |
3293 } | |
3294 } | |
3295 | |
3296 vectype_in = get_vectype_for_scalar_type (rhs_type); | |
3297 if (!vectype_in) | |
3298 return false; | |
3299 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); | |
3300 | |
3301 lhs_type = TREE_TYPE (gimple_call_lhs (stmt)); | |
3302 vectype_out = get_vectype_for_scalar_type (lhs_type); | |
3303 if (!vectype_out) | |
3304 return false; | |
3305 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); | |
3306 | |
3307 /* FORNOW */ | |
3308 if (nunits_in == nunits_out / 2) | |
3309 modifier = NARROW; | |
3310 else if (nunits_out == nunits_in) | |
3311 modifier = NONE; | |
3312 else if (nunits_out == nunits_in / 2) | |
3313 modifier = WIDEN; | |
3314 else | |
3315 return false; | |
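/* An editor's example of this classification: a call taking vector(2)
   double arguments and producing vector(4) float results has
   nunits_in == 2 and nunits_out == 4, hence NARROW; equal unit counts
   give NONE, and no current target triggers WIDEN (see below).  */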
3316 | |
3317 /* For now, we only vectorize functions if a target specific builtin | |
3318 is available. TODO -- in some cases, it might be profitable to | |
3319 insert the calls for pieces of the vector, in order to be able | |
3320 to vectorize other operations in the loop. */ | |
3321 fndecl = vectorizable_function (stmt, vectype_out, vectype_in); | |
3322 if (fndecl == NULL_TREE) | |
3323 { | |
3324 if (vect_print_dump_info (REPORT_DETAILS)) | |
3325 fprintf (vect_dump, "function is not vectorizable."); | |
3326 | |
3327 return false; | |
3328 } | |
3329 | |
3330 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS)); | |
3331 | |
3332 if (modifier == NARROW) | |
3333 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out; | |
3334 else | |
3335 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; | |
3336 | |
3337 /* Sanity check: make sure that at least one copy of the vectorized stmt | |
3338 needs to be generated. */ | |
3339 gcc_assert (ncopies >= 1); | |
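/* E.g. (illustrative numbers): with a vectorization factor of 8 and
   vector(4) float arguments under the NONE modifier, ncopies is
   8 / 4 == 2, so two vector calls replace eight scalar calls.  */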
3340 | |
3341 if (!vec_stmt) /* transformation not required. */ | |
3342 { | |
3343 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type; | |
3344 if (vect_print_dump_info (REPORT_DETAILS)) | |
3345 fprintf (vect_dump, "=== vectorizable_call ==="); | |
3346 vect_model_simple_cost (stmt_info, ncopies, dt, NULL); | |
3347 return true; | |
3348 } | |
3349 | |
3350 /** Transform. **/ | |
3351 | |
3352 if (vect_print_dump_info (REPORT_DETAILS)) | |
3353 fprintf (vect_dump, "transform operation."); | |
3354 | |
3355 /* Handle def. */ | |
3356 scalar_dest = gimple_call_lhs (stmt); | |
3357 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); | |
3358 | |
3359 prev_stmt_info = NULL; | |
3360 switch (modifier) | |
3361 { | |
3362 case NONE: | |
3363 for (j = 0; j < ncopies; ++j) | |
3364 { | |
3365 /* Build argument list for the vectorized call. */ | |
3366 if (j == 0) | |
3367 vargs = VEC_alloc (tree, heap, nargs); | |
3368 else | |
3369 VEC_truncate (tree, vargs, 0); | |
3370 | |
3371 for (i = 0; i < nargs; i++) | |
3372 { | |
3373 op = gimple_call_arg (stmt, i); | |
3374 if (j == 0) | |
3375 vec_oprnd0 | |
3376 = vect_get_vec_def_for_operand (op, stmt, NULL); | |
3377 else | |
3378 vec_oprnd0 | |
3379 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0); | |
3380 | |
3381 VEC_quick_push (tree, vargs, vec_oprnd0); | |
3382 } | |
3383 | |
3384 new_stmt = gimple_build_call_vec (fndecl, vargs); | |
3385 new_temp = make_ssa_name (vec_dest, new_stmt); | |
3386 gimple_call_set_lhs (new_stmt, new_temp); | |
3387 | |
3388 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
3389 | |
3390 if (j == 0) | |
3391 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
3392 else | |
3393 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3394 | |
3395 prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3396 } | |
3397 | |
3398 break; | |
3399 | |
3400 case NARROW: | |
3401 for (j = 0; j < ncopies; ++j) | |
3402 { | |
3403 /* Build argument list for the vectorized call. */ | |
3404 if (j == 0) | |
3405 vargs = VEC_alloc (tree, heap, nargs * 2); | |
3406 else | |
3407 VEC_truncate (tree, vargs, 0); | |
3408 | |
3409 for (i = 0; i < nargs; i++) | |
3410 { | |
3411 op = gimple_call_arg (stmt, i); | |
3412 if (j == 0) | |
3413 { | |
3414 vec_oprnd0 | |
3415 = vect_get_vec_def_for_operand (op, stmt, NULL); | |
3416 vec_oprnd1 | |
3417 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0); | |
3418 } | |
3419 else | |
3420 { | |
3421 vec_oprnd0 | |
3422 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd1); | |
3423 vec_oprnd1 | |
3424 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0); | |
3425 } | |
3426 | |
3427 VEC_quick_push (tree, vargs, vec_oprnd0); | |
3428 VEC_quick_push (tree, vargs, vec_oprnd1); | |
3429 } | |
3430 | |
3431 new_stmt = gimple_build_call_vec (fndecl, vargs); | |
3432 new_temp = make_ssa_name (vec_dest, new_stmt); | |
3433 gimple_call_set_lhs (new_stmt, new_temp); | |
3434 | |
3435 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
3436 | |
3437 if (j == 0) | |
3438 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; | |
3439 else | |
3440 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3441 | |
3442 prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3443 } | |
3444 | |
3445 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); | |
3446 | |
3447 break; | |
3448 | |
3449 case WIDEN: | |
3450 /* No current target implements this case. */ | |
3451 return false; | |
3452 } | |
3453 | |
3454 VEC_free (tree, heap, vargs); | |
3455 | |
3456 /* Update the exception handling table with the vector stmt if necessary. */ | |
3457 if (maybe_clean_or_replace_eh_stmt (stmt, *vec_stmt)) | |
3458 gimple_purge_dead_eh_edges (gimple_bb (stmt)); | |
3459 | |
3460 /* The call in STMT might prevent it from being removed in dce. | |
3461 We cannot, however, remove it here, due to the way the ssa name | |
3462 it defines is mapped to the new definition. So just replace the | |
3463 rhs of the statement with something harmless. */ | |
3464 | |
3465 type = TREE_TYPE (scalar_dest); | |
3466 new_stmt = gimple_build_assign (gimple_call_lhs (stmt), | |
3467 fold_convert (type, integer_zero_node)); | |
3468 set_vinfo_for_stmt (new_stmt, stmt_info); | |
3469 set_vinfo_for_stmt (stmt, NULL); | |
3470 STMT_VINFO_STMT (stmt_info) = new_stmt; | |
3471 gsi_replace (gsi, new_stmt, false); | |
3472 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt; | |
3473 | |
3474 return true; | |
3475 } | |
3476 | |
3477 | |
3478 /* Function vect_gen_widened_results_half | |
3479 | |
3480 Create a vector stmt whose code, number of arguments, and result | |
3481 variable are CODE, OP_TYPE, and VEC_DEST, and whose arguments are | |
3482 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI. | |
3483 In the case that CODE is a CALL_EXPR, this means that a call to DECL | |
3484 needs to be created (DECL is a function-decl of a target-builtin). | |
3485 STMT is the original scalar stmt that we are vectorizing. */ | |
3486 | |
3487 static gimple | |
3488 vect_gen_widened_results_half (enum tree_code code, | |
3489 tree decl, | |
3490 tree vec_oprnd0, tree vec_oprnd1, int op_type, | |
3491 tree vec_dest, gimple_stmt_iterator *gsi, | |
3492 gimple stmt) | |
3493 { | |
3494 gimple new_stmt; | |
3495 tree new_temp; | |
3496 tree sym; | |
3497 ssa_op_iter iter; | |
3498 | |
3499 /* Generate half of the widened result: */ | |
3500 if (code == CALL_EXPR) | |
3501 { | |
3502 /* Target specific support */ | |
3503 if (op_type == binary_op) | |
3504 new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1); | |
3505 else | |
3506 new_stmt = gimple_build_call (decl, 1, vec_oprnd0); | |
3507 new_temp = make_ssa_name (vec_dest, new_stmt); | |
3508 gimple_call_set_lhs (new_stmt, new_temp); | |
3509 } | |
3510 else | |
3511 { | |
3512 /* Generic support */ | |
3513 gcc_assert (op_type == TREE_CODE_LENGTH (code)); | |
3514 if (op_type != binary_op) | |
3515 vec_oprnd1 = NULL; | |
3516 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0, | |
3517 vec_oprnd1); | |
3518 new_temp = make_ssa_name (vec_dest, new_stmt); | |
3519 gimple_assign_set_lhs (new_stmt, new_temp); | |
3520 } | |
3521 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
3522 | |
3523 if (code == CALL_EXPR) | |
3524 { | |
3525 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS) | |
3526 { | |
3527 if (TREE_CODE (sym) == SSA_NAME) | |
3528 sym = SSA_NAME_VAR (sym); | |
3529 mark_sym_for_renaming (sym); | |
3530 } | |
3531 } | |
3532 | |
3533 return new_stmt; | |
3534 } | |
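/* Illustrative use (editor's note): widening vector(8) short operands into
   two vector(4) int results is done by calling this function twice, e.g.
   once with VEC_UNPACK_LO_EXPR and once with VEC_UNPACK_HI_EXPR as CODE,
   each call producing one half of the widened result.  */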
3535 | |
3536 | |
3537 /* Check if STMT performs a conversion operation that can be vectorized. | |
3538 If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
3539 stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
3540 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
3541 | |
3542 bool | |
3543 vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi, | |
3544 gimple *vec_stmt, slp_tree slp_node) | |
3545 { | |
3546 tree vec_dest; | |
3547 tree scalar_dest; | |
3548 tree op0; | |
3549 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE; | |
3550 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
3551 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
3552 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK; | |
3553 tree decl1 = NULL_TREE, decl2 = NULL_TREE; | |
3554 tree new_temp; | |
3555 tree def; | |
3556 gimple def_stmt; | |
3557 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; | |
3558 gimple new_stmt = NULL; | |
3559 stmt_vec_info prev_stmt_info; | |
3560 int nunits_in; | |
3561 int nunits_out; | |
3562 tree vectype_out, vectype_in; | |
3563 int ncopies, j; | |
3565 tree rhs_type, lhs_type; | |
3566 tree builtin_decl; | |
3567 enum { NARROW, NONE, WIDEN } modifier; | |
3568 int i; | |
3569 VEC(tree,heap) *vec_oprnds0 = NULL; | |
3570 tree vop0; | |
3571 tree integral_type; | |
3572 VEC(tree,heap) *dummy = NULL; | |
3573 int dummy_int; | |
3574 | |
3575 /* Is STMT a vectorizable conversion? */ | |
3576 | |
3577 if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
3578 return false; | |
3579 | |
3580 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
3581 return false; | |
3582 | |
3583 if (!is_gimple_assign (stmt)) | |
3584 return false; | |
3585 | |
3586 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME) | |
3587 return false; | |
3588 | |
3589 code = gimple_assign_rhs_code (stmt); | |
3590 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR) | |
3591 return false; | |
3592 | |
3593 /* Check types of lhs and rhs. */ | |
3594 op0 = gimple_assign_rhs1 (stmt); | |
3595 rhs_type = TREE_TYPE (op0); | |
3596 vectype_in = get_vectype_for_scalar_type (rhs_type); | |
3597 if (!vectype_in) | |
3598 return false; | |
3599 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); | |
3600 | |
3601 scalar_dest = gimple_assign_lhs (stmt); | |
3602 lhs_type = TREE_TYPE (scalar_dest); | |
3603 vectype_out = get_vectype_for_scalar_type (lhs_type); | |
3604 if (!vectype_out) | |
3605 return false; | |
3606 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); | |
3607 | |
3608 /* FORNOW */ | |
3609 if (nunits_in == nunits_out / 2) | |
3610 modifier = NARROW; | |
3611 else if (nunits_out == nunits_in) | |
3612 modifier = NONE; | |
3613 else if (nunits_out == nunits_in / 2) | |
3614 modifier = WIDEN; | |
3615 else | |
3616 return false; | |
3617 | |
3618 if (modifier == NONE) | |
3619 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out); | |
3620 | |
3621 /* Bail out if the types are both integral or both non-integral. */ | |
3622 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type)) | |
3623 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type))) | |
3624 return false; | |
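/* In other words (editor's note), only int <-> float conversions are
   handled here, e.g. FLOAT_EXPR for '(float) i' and FIX_TRUNC_EXPR for
   '(int) f'; int <-> int conversions are instead treated as type
   promotion/demotion (see vectorizable_type_demotion below).  */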
3625 | |
3626 integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out; | |
3627 | |
3628 if (modifier == NARROW) | |
3629 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out; | |
3630 else | |
3631 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; | |
3632 | |
3633 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies | |
3634 this, so we can safely override NCOPIES with 1 here. */ | |
3635 if (slp_node) | |
3636 ncopies = 1; | |
3637 | |
3638 /* Sanity check: make sure that at least one copy of the vectorized stmt | |
3639 needs to be generated. */ | |
3640 gcc_assert (ncopies >= 1); | |
3641 | |
3642 /* Check the operands of the operation. */ | |
3643 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0])) | |
3644 { | |
3645 if (vect_print_dump_info (REPORT_DETAILS)) | |
3646 fprintf (vect_dump, "use not simple."); | |
3647 return false; | |
3648 } | |
3649 | |
3650 /* Supportable by target? */ | |
3651 if ((modifier == NONE | |
3652 && !targetm.vectorize.builtin_conversion (code, integral_type)) | |
3653 || (modifier == WIDEN | |
3654 && !supportable_widening_operation (code, stmt, vectype_in, | |
3655 &decl1, &decl2, | |
3656 &code1, &code2, | |
3657 &dummy_int, &dummy)) | |
3658 || (modifier == NARROW | |
3659 && !supportable_narrowing_operation (code, stmt, vectype_in, | |
3660 &code1, &dummy_int, &dummy))) | |
3661 { | |
3662 if (vect_print_dump_info (REPORT_DETAILS)) | |
3663 fprintf (vect_dump, "conversion not supported by target."); | |
3664 return false; | |
3665 } | |
3666 | |
3667 if (modifier != NONE) | |
3668 { | |
3669 STMT_VINFO_VECTYPE (stmt_info) = vectype_in; | |
3670 /* FORNOW: SLP not supported. */ | |
3671 if (STMT_SLP_TYPE (stmt_info)) | |
3672 return false; | |
3673 } | |
3674 | |
3675 if (!vec_stmt) /* transformation not required. */ | |
3676 { | |
3677 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; | |
3678 return true; | |
3679 } | |
3680 | |
3681 /** Transform. **/ | |
3682 if (vect_print_dump_info (REPORT_DETAILS)) | |
3683 fprintf (vect_dump, "transform conversion."); | |
3684 | |
3685 /* Handle def. */ | |
3686 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); | |
3687 | |
3688 if (modifier == NONE && !slp_node) | |
3689 vec_oprnds0 = VEC_alloc (tree, heap, 1); | |
3690 | |
3691 prev_stmt_info = NULL; | |
3692 switch (modifier) | |
3693 { | |
3694 case NONE: | |
3695 for (j = 0; j < ncopies; j++) | |
3696 { | |
3697 tree sym; | |
3698 ssa_op_iter iter; | |
3699 | |
3700 if (j == 0) | |
3701 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node); | |
3702 else | |
3703 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL); | |
3704 | |
3705 builtin_decl = | |
3706 targetm.vectorize.builtin_conversion (code, integral_type); | |
3707 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++) | |
3708 { | |
3709 /* Arguments are ready. Create the new vector stmt. */ | |
3710 new_stmt = gimple_build_call (builtin_decl, 1, vop0); | |
3711 new_temp = make_ssa_name (vec_dest, new_stmt); | |
3712 gimple_call_set_lhs (new_stmt, new_temp); | |
3713 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
3714 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, | |
3715 SSA_OP_ALL_VIRTUALS) | |
3716 { | |
3717 if (TREE_CODE (sym) == SSA_NAME) | |
3718 sym = SSA_NAME_VAR (sym); | |
3719 mark_sym_for_renaming (sym); | |
3720 } | |
3721 if (slp_node) | |
3722 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt); | |
3723 } | |
3724 | |
3725 if (j == 0) | |
3726 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
3727 else | |
3728 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3729 prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3730 } | |
3731 break; | |
3732 | |
3733 case WIDEN: | |
3734 /* In case the vectorization factor (VF) is bigger than the number | |
3735 of elements that we can fit in a vectype (nunits), we have to | |
3736 generate more than one vector stmt - i.e - we need to "unroll" | |
3737 the vector stmt by a factor VF/nunits. */ | |
3738 for (j = 0; j < ncopies; j++) | |
3739 { | |
3740 if (j == 0) | |
3741 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); | |
3742 else | |
3743 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); | |
3744 | |
3745 STMT_VINFO_VECTYPE (stmt_info) = vectype_in; | |
3746 | |
3747 /* Generate first half of the widened result: */ | |
3748 new_stmt | |
3749 = vect_gen_widened_results_half (code1, decl1, | |
3750 vec_oprnd0, vec_oprnd1, | |
3751 unary_op, vec_dest, gsi, stmt); | |
3752 if (j == 0) | |
3753 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; | |
3754 else | |
3755 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3756 prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3757 | |
3758 /* Generate second half of the widened result: */ | |
3759 new_stmt | |
3760 = vect_gen_widened_results_half (code2, decl2, | |
3761 vec_oprnd0, vec_oprnd1, | |
3762 unary_op, vec_dest, gsi, stmt); | |
3763 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3764 prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3765 } | |
3766 break; | |
3767 | |
3768 case NARROW: | |
3769 /* In case the vectorization factor (VF) is bigger than the number | |
3770 of elements that we can fit in a vectype (nunits), we have to | |
3771 generate more than one vector stmt - i.e - we need to "unroll" | |
3772 the vector stmt by a factor VF/nunits. */ | |
3773 for (j = 0; j < ncopies; j++) | |
3774 { | |
3775 /* Handle uses. */ | |
3776 if (j == 0) | |
3777 { | |
3778 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); | |
3779 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); | |
3780 } | |
3781 else | |
3782 { | |
3783 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1); | |
3784 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); | |
3785 } | |
3786 | |
3787 /* Arguments are ready. Create the new vector stmt. */ | |
3789 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0, | |
3790 vec_oprnd1); | |
3791 new_temp = make_ssa_name (vec_dest, new_stmt); | |
3792 gimple_assign_set_lhs (new_stmt, new_temp); | |
3793 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
3794 | |
3795 if (j == 0) | |
3796 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; | |
3797 else | |
3798 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3799 | |
3800 prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3801 } | |
3802 | |
3803 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); | |
3804 } | |
3805 | |
3806 if (vec_oprnds0) | |
3807 VEC_free (tree, heap, vec_oprnds0); | |
3808 | |
3809 return true; | |
3810 } | |
3811 | |
3812 | |
3813 /* Function vectorizable_assignment. | |
3814 | |
3815 Check if STMT performs an assignment (copy) that can be vectorized. | |
3816 If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
3817 stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
3818 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
3819 | |
3820 bool | |
3821 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi, | |
3822 gimple *vec_stmt, slp_tree slp_node) | |
3823 { | |
3824 tree vec_dest; | |
3825 tree scalar_dest; | |
3826 tree op; | |
3827 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
3828 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
3829 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
3830 tree new_temp; | |
3831 tree def; | |
3832 gimple def_stmt; | |
3833 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; | |
3834 int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
3835 int ncopies; | |
3836 int i; | |
3837 VEC(tree,heap) *vec_oprnds = NULL; | |
3838 tree vop; | |
3839 | |
3840 /* Multiple types in SLP are handled by creating the appropriate number of | |
3841 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in | |
3842 case of SLP. */ | |
3843 if (slp_node) | |
3844 ncopies = 1; | |
3845 else | |
3846 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
3847 | |
3848 gcc_assert (ncopies >= 1); | |
3849 if (ncopies > 1) | |
3850 return false; /* FORNOW */ | |
3851 | |
3852 if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
3853 return false; | |
3854 | |
3855 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
3856 return false; | |
3857 | |
3858 /* Is vectorizable assignment? */ | |
3859 if (!is_gimple_assign (stmt)) | |
3860 return false; | |
3861 | |
3862 scalar_dest = gimple_assign_lhs (stmt); | |
3863 if (TREE_CODE (scalar_dest) != SSA_NAME) | |
3864 return false; | |
3865 | |
3866 if (gimple_assign_single_p (stmt) | |
3867 || gimple_assign_rhs_code (stmt) == PAREN_EXPR) | |
3868 op = gimple_assign_rhs1 (stmt); | |
3869 else | |
3870 return false; | |
3871 | |
3872 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0])) | |
3873 { | |
3874 if (vect_print_dump_info (REPORT_DETAILS)) | |
3875 fprintf (vect_dump, "use not simple."); | |
3876 return false; | |
3877 } | |
3878 | |
3879 if (!vec_stmt) /* transformation not required. */ | |
3880 { | |
3881 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type; | |
3882 if (vect_print_dump_info (REPORT_DETAILS)) | |
3883 fprintf (vect_dump, "=== vectorizable_assignment ==="); | |
3884 vect_model_simple_cost (stmt_info, ncopies, dt, NULL); | |
3885 return true; | |
3886 } | |
3887 | |
3888 /** Transform. **/ | |
3889 if (vect_print_dump_info (REPORT_DETAILS)) | |
3890 fprintf (vect_dump, "transform assignment."); | |
3891 | |
3892 /* Handle def. */ | |
3893 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
3894 | |
3895 /* Handle use. */ | |
3896 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node); | |
3897 | |
3898 /* Arguments are ready. Create the new vector stmt. */ | |
3899 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++) | |
3900 { | |
3901 *vec_stmt = gimple_build_assign (vec_dest, vop); | |
3902 new_temp = make_ssa_name (vec_dest, *vec_stmt); | |
3903 gimple_assign_set_lhs (*vec_stmt, new_temp); | |
3904 vect_finish_stmt_generation (stmt, *vec_stmt, gsi); | |
3905 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt; | |
3906 | |
3907 if (slp_node) | |
3908 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt); | |
3909 } | |
3910 | |
3911 VEC_free (tree, heap, vec_oprnds); | |
3912 return true; | |
3913 } | |
3914 | |
3915 | |
3916 /* Function vect_min_worthwhile_factor. | |
3917 | |
3918 For a loop where we could vectorize the operation indicated by CODE, | |
3919 return the minimum vectorization factor that makes it worthwhile | |
3920 to use generic vectors. */ | |
3921 static int | |
3922 vect_min_worthwhile_factor (enum tree_code code) | |
3923 { | |
3924 switch (code) | |
3925 { | |
3926 case PLUS_EXPR: | |
3927 case MINUS_EXPR: | |
3928 case NEGATE_EXPR: | |
3929 return 4; | |
3930 | |
3931 case BIT_AND_EXPR: | |
3932 case BIT_IOR_EXPR: | |
3933 case BIT_XOR_EXPR: | |
3934 case BIT_NOT_EXPR: | |
3935 return 2; | |
3936 | |
3937 default: | |
3938 return INT_MAX; | |
3939 } | |
3940 } | |
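/* Worked example (editor's note): with 32-bit words, a generic-vector
   PLUS_EXPR on chars packs 4 elements per word but needs extra scalar ops
   to mask the carries between elements, so it only pays off for VF >= 4;
   bitwise ops need no such masking, hence the lower threshold of 2.  */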
3941 | |
3942 | |
3943 /* Function vectorizable_induction | |
3944 | |
3945 Check if PHI performs an induction computation that can be vectorized. | |
3946 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized | |
3947 phi to replace it, put it in VEC_STMT, and add it to the same basic block. | |
3948 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
3949 | |
3950 bool | |
3951 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, | |
3952 gimple *vec_stmt) | |
3953 { | |
3954 stmt_vec_info stmt_info = vinfo_for_stmt (phi); | |
3955 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
3956 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
3957 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
3958 int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
3959 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
3960 tree vec_def; | |
3961 | |
3962 gcc_assert (ncopies >= 1); | |
3963 /* FORNOW. This restriction should be relaxed. */ | |
3964 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1) | |
3965 { | |
3966 if (vect_print_dump_info (REPORT_DETAILS)) | |
3967 fprintf (vect_dump, "multiple types in nested loop."); | |
3968 return false; | |
3969 } | |
3970 | |
3971 if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
3972 return false; | |
3973 | |
3974 /* FORNOW: SLP not supported. */ | |
3975 if (STMT_SLP_TYPE (stmt_info)) | |
3976 return false; | |
3977 | |
3978 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def); | |
3979 | |
3980 if (gimple_code (phi) != GIMPLE_PHI) | |
3981 return false; | |
3982 | |
3983 if (!vec_stmt) /* transformation not required. */ | |
3984 { | |
3985 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; | |
3986 if (vect_print_dump_info (REPORT_DETAILS)) | |
3987 fprintf (vect_dump, "=== vectorizable_induction ==="); | |
3988 vect_model_induction_cost (stmt_info, ncopies); | |
3989 return true; | |
3990 } | |
3991 | |
3992 /** Transform. **/ | |
3993 | |
3994 if (vect_print_dump_info (REPORT_DETAILS)) | |
3995 fprintf (vect_dump, "transform induction phi."); | |
3996 | |
3997 vec_def = get_initial_def_for_induction (phi); | |
3998 *vec_stmt = SSA_NAME_DEF_STMT (vec_def); | |
3999 return true; | |
4000 } | |
4001 | |
4002 | |
4003 /* Function vectorizable_operation. | |
4004 | |
4005 Check if STMT performs a binary or unary operation that can be vectorized. | |
4006 If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
4007 stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
4008 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
4009 | |
4010 bool | |
4011 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi, | |
4012 gimple *vec_stmt, slp_tree slp_node) | |
4013 { | |
4014 tree vec_dest; | |
4015 tree scalar_dest; | |
4016 tree op0, op1 = NULL; | |
4017 tree vec_oprnd1 = NULL_TREE; | |
4018 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
4019 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
4020 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
4021 enum tree_code code; | |
4022 enum machine_mode vec_mode; | |
4023 tree new_temp; | |
4024 int op_type; | |
4025 optab optab; | |
4026 int icode; | |
4027 enum machine_mode optab_op2_mode; | |
4028 tree def; | |
4029 gimple def_stmt; | |
4030 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; | |
4031 gimple new_stmt = NULL; | |
4032 stmt_vec_info prev_stmt_info; | |
4033 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype); | |
4034 int nunits_out; | |
4035 tree vectype_out; | |
4036 int ncopies; | |
4037 int j, i; | |
4038 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL; | |
4039 tree vop0, vop1; | |
4040 unsigned int k; | |
4041 bool shift_p = false; | |
4042 bool scalar_shift_arg = false; | |
4043 | |
4044 /* Multiple types in SLP are handled by creating the appropriate number of | |
4045 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in | |
4046 case of SLP. */ | |
4047 if (slp_node) | |
4048 ncopies = 1; | |
4049 else | |
4050 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; | |
4051 | |
4052 gcc_assert (ncopies >= 1); | |
4053 | |
4054 if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
4055 return false; | |
4056 | |
4057 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
4058 return false; | |
4059 | |
4060 /* Is STMT a vectorizable binary/unary operation? */ | |
4061 if (!is_gimple_assign (stmt)) | |
4062 return false; | |
4063 | |
4064 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME) | |
4065 return false; | |
4066 | |
4067 scalar_dest = gimple_assign_lhs (stmt); | |
4068 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest)); | |
4069 if (!vectype_out) | |
4070 return false; | |
4071 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); | |
4072 if (nunits_out != nunits_in) | |
4073 return false; | |
4074 | |
4075 code = gimple_assign_rhs_code (stmt); | |
4076 | |
4077 /* For pointer addition, we should use the normal plus for | |
4078 the vector addition. */ | |
4079 if (code == POINTER_PLUS_EXPR) | |
4080 code = PLUS_EXPR; | |
4081 | |
4082 /* Support only unary or binary operations. */ | |
4083 op_type = TREE_CODE_LENGTH (code); | |
4084 if (op_type != unary_op && op_type != binary_op) | |
4085 { | |
4086 if (vect_print_dump_info (REPORT_DETAILS)) | |
4087 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type); | |
4088 return false; | |
4089 } | |
4090 | |
4091 op0 = gimple_assign_rhs1 (stmt); | |
4092 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0])) | |
4093 { | |
4094 if (vect_print_dump_info (REPORT_DETAILS)) | |
4095 fprintf (vect_dump, "use not simple."); | |
4096 return false; | |
4097 } | |
4098 | |
4099 if (op_type == binary_op) | |
4100 { | |
4101 op1 = gimple_assign_rhs2 (stmt); | |
4102 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1])) | |
4103 { | |
4104 if (vect_print_dump_info (REPORT_DETAILS)) | |
4105 fprintf (vect_dump, "use not simple."); | |
4106 return false; | |
4107 } | |
4108 } | |
4109 | |
4110 /* If this is a shift/rotate, determine whether the shift amount is a vector | |
4111 or a scalar. If the shift/rotate amount is a vector, use the vector/vector | |
4112 shift optabs. */ | |
4113 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR | |
4114 || code == RROTATE_EXPR) | |
4115 { | |
4116 shift_p = true; | |
4117 | |
4118 /* vector shifted by vector */ | |
4119 if (dt[1] == vect_loop_def) | |
4120 { | |
4121 optab = optab_for_tree_code (code, vectype, optab_vector); | |
4122 if (vect_print_dump_info (REPORT_DETAILS)) | |
4123 fprintf (vect_dump, "vector/vector shift/rotate found."); | |
4124 } | |
4125 | |
4126 /* See whether the machine has a vector-shifted-by-scalar insn, and if | |
4127 not, whether it has a vector-shifted-by-vector insn. */ | |
4128 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def) | |
4129 { | |
4130 optab = optab_for_tree_code (code, vectype, optab_scalar); | |
4131 if (optab | |
4132 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code | |
4133 != CODE_FOR_nothing)) | |
4134 { | |
4135 scalar_shift_arg = true; | |
4136 if (vect_print_dump_info (REPORT_DETAILS)) | |
4137 fprintf (vect_dump, "vector/scalar shift/rotate found."); | |
4138 } | |
4139 else | |
4140 { | |
4141 optab = optab_for_tree_code (code, vectype, optab_vector); | |
4142 if (vect_print_dump_info (REPORT_DETAILS) | |
4143 && optab | |
4144 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code | |
4145 != CODE_FOR_nothing)) | |
4146 fprintf (vect_dump, "vector/vector shift/rotate found."); | |
4147 } | |
4148 } | |
4149 | |
4150 else | |
4151 { | |
4152 if (vect_print_dump_info (REPORT_DETAILS)) | |
4153 fprintf (vect_dump, "operand mode requires invariant argument."); | |
4154 return false; | |
4155 } | |
4156 } | |
4157 else | |
4158 optab = optab_for_tree_code (code, vectype, optab_default); | |
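/* An editor's example of the selection above: for 'x << 3' on vector(8)
   short, dt[1] is vect_constant_def, so the scalar-shift-amount optab is
   tried first; a target that can only shift by a whole vector of amounts
   falls back to the vector/vector variant instead.  */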
4159 | |
4160 /* Supportable by target? */ | |
4161 if (!optab) | |
4162 { | |
4163 if (vect_print_dump_info (REPORT_DETAILS)) | |
4164 fprintf (vect_dump, "no optab."); | |
4165 return false; | |
4166 } | |
4167 vec_mode = TYPE_MODE (vectype); | |
4168 icode = (int) optab_handler (optab, vec_mode)->insn_code; | |
4169 if (icode == CODE_FOR_nothing) | |
4170 { | |
4171 if (vect_print_dump_info (REPORT_DETAILS)) | |
4172 fprintf (vect_dump, "op not supported by target."); | |
4173 /* Check only during analysis. */ | |
4174 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD | |
4175 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
4176 < vect_min_worthwhile_factor (code) | |
4177 && !vec_stmt)) | |
4178 return false; | |
4179 if (vect_print_dump_info (REPORT_DETAILS)) | |
4180 fprintf (vect_dump, "proceeding using word mode."); | |
4181 } | |
4182 | |
4183 /* Worthwhile without SIMD support? Check only during analysis. */ | |
4184 if (!VECTOR_MODE_P (TYPE_MODE (vectype)) | |
4185 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
4186 < vect_min_worthwhile_factor (code) | |
4187 && !vec_stmt) | |
4188 { | |
4189 if (vect_print_dump_info (REPORT_DETAILS)) | |
4190 fprintf (vect_dump, "not worthwhile without SIMD support."); | |
4191 return false; | |
4192 } | |
4193 | |
4194 if (!vec_stmt) /* transformation not required. */ | |
4195 { | |
4196 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type; | |
4197 if (vect_print_dump_info (REPORT_DETAILS)) | |
4198 fprintf (vect_dump, "=== vectorizable_operation ==="); | |
4199 vect_model_simple_cost (stmt_info, ncopies, dt, NULL); | |
4200 return true; | |
4201 } | |
4202 | |
4203 /** Transform. **/ | |
4204 | |
4205 if (vect_print_dump_info (REPORT_DETAILS)) | |
4206 fprintf (vect_dump, "transform binary/unary operation."); | |
4207 | |
4208 /* Handle def. */ | |
4209 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
4210 | |
4211 /* Allocate VECs for vector operands. In case of SLP, vector operands are | |
4212 created in the previous stages of the recursion, so no allocation is | |
4213 needed, except for the case of shift with scalar shift argument. In that | |
4214 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to | |
4215 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE. | |
4216 In case of loop-based vectorization we allocate VECs of size 1. We | |
4217 allocate VEC_OPRNDS1 only in case of binary operation. */ | |
4218 if (!slp_node) | |
4219 { | |
4220 vec_oprnds0 = VEC_alloc (tree, heap, 1); | |
4221 if (op_type == binary_op) | |
4222 vec_oprnds1 = VEC_alloc (tree, heap, 1); | |
4223 } | |
4224 else if (scalar_shift_arg) | |
4225 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size); | |
4226 | |
4227 /* In case the vectorization factor (VF) is bigger than the number | |
4228 of elements that we can fit in a vectype (nunits), we have to generate | |
4229 more than one vector stmt - i.e - we need to "unroll" the | |
4230 vector stmt by a factor VF/nunits. In doing so, we record a pointer | |
4231 from one copy of the vector stmt to the next, in the field | |
4232 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following | |
4233 stages to find the correct vector defs to be used when vectorizing | |
4234 stmts that use the defs of the current stmt. The example below illustrates | |
4235 the vectorization process when VF=16 and nunits=4 (i.e - we need to create | |
4236 4 vectorized stmts): | |
4237 | |
4238 before vectorization: | |
4239 RELATED_STMT VEC_STMT | |
4240 S1: x = memref - - | |
4241 S2: z = x + 1 - - | |
4242 | |
4243 step 1: vectorize stmt S1 (done in vectorizable_load. See more details | |
4244 there): | |
4245 RELATED_STMT VEC_STMT | |
4246 VS1_0: vx0 = memref0 VS1_1 - | |
4247 VS1_1: vx1 = memref1 VS1_2 - | |
4248 VS1_2: vx2 = memref2 VS1_3 - | |
4249 VS1_3: vx3 = memref3 - - | |
4250 S1: x = load - VS1_0 | |
4251 S2: z = x + 1 - - | |
4252 | |
4253 step2: vectorize stmt S2 (done here): | |
4254 To vectorize stmt S2 we first need to find the relevant vector | |
4255 def for the first operand 'x'. This is, as usual, obtained from | |
4256 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt | |
4257 that defines 'x' (S1). This way we find the stmt VS1_0, and the | |
4258 relevant vector def 'vx0'. Having found 'vx0' we can generate | |
4259 the vector stmt VS2_0, and as usual, record it in the | |
4260 STMT_VINFO_VEC_STMT of stmt S2. | |
4261 When creating the second copy (VS2_1), we obtain the relevant vector | |
4262 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of | |
4263 stmt VS1_0. This way we find the stmt VS1_1 and the relevant | |
4264 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a | |
4265 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0. | |
4266 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting | |
4267 chain of stmts and pointers: | |
4268 RELATED_STMT VEC_STMT | |
4269 VS1_0: vx0 = memref0 VS1_1 - | |
4270 VS1_1: vx1 = memref1 VS1_2 - | |
4271 VS1_2: vx2 = memref2 VS1_3 - | |
4272 VS1_3: vx3 = memref3 - - | |
4273 S1: x = load - VS1_0 | |
4274 VS2_0: vz0 = vx0 + v1 VS2_1 - | |
4275 VS2_1: vz1 = vx1 + v1 VS2_2 - | |
4276 VS2_2: vz2 = vx2 + v1 VS2_3 - | |
4277 VS2_3: vz3 = vx3 + v1 - - | |
4278 S2: z = x + 1 - VS2_0 */ | |
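/* An editor's sketch of how the loop below realizes this: on copy j == 0
   the operands come from vect_get_vec_defs (the STMT_VINFO_VEC_STMT
   links), while for j > 0 vect_get_vec_defs_for_stmt_copy follows the
   RELATED_STMT chain of the previous copy's defs, exactly as in the
   VS1/VS2 table above.  */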
4279 | |
4280 prev_stmt_info = NULL; | |
4281 for (j = 0; j < ncopies; j++) | |
4282 { | |
4283 /* Handle uses. */ | |
4284 if (j == 0) | |
4285 { | |
4286 if (op_type == binary_op && scalar_shift_arg) | |
4287 { | |
4288 /* Vector shl and shr insn patterns can be defined with scalar | |
4289 operand 2 (shift operand). In this case, use constant or loop | |
4290 invariant op1 directly, without extending it to vector mode | |
4291 first. */ | |
4292 optab_op2_mode = insn_data[icode].operand[2].mode; | |
4293 if (!VECTOR_MODE_P (optab_op2_mode)) | |
4294 { | |
4295 if (vect_print_dump_info (REPORT_DETAILS)) | |
4296 fprintf (vect_dump, "operand 1 using scalar mode."); | |
4297 vec_oprnd1 = op1; | |
4298 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1); | |
4299 if (slp_node) | |
4300 { | |
4301 /* Store vec_oprnd1 for every vector stmt to be created | |
4302 for SLP_NODE. We check during the analysis that all the | |
4303 shift arguments are the same. | |
4304 TODO: Allow different constants for different vector | |
4305 stmts generated for an SLP instance. */ | |
4306 for (k = 0; k < slp_node->vec_stmts_size - 1; k++) | |
4307 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1); | |
4308 } | |
4309 } | |
4310 } | |
4311 | |
4312 /* vec_oprnd1 is available if operand 1 should be of a scalar-type | |
4313 (a special case for certain kinds of vector shifts); otherwise, | |
4314 operand 1 should be of a vector type (the usual case). */ | |
4315 if (op_type == binary_op && !vec_oprnd1) | |
4316 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1, | |
4317 slp_node); | |
4318 else | |
4319 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, | |
4320 slp_node); | |
4321 } | |
4322 else | |
4323 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1); | |
4324 | |
4325 /* Arguments are ready. Create the new vector stmt. */ | |
4326 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++) | |
4327 { | |
4328 vop1 = ((op_type == binary_op) | |
4329 ? VEC_index (tree, vec_oprnds1, i) : NULL); | |
4330 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1); | |
4331 new_temp = make_ssa_name (vec_dest, new_stmt); | |
4332 gimple_assign_set_lhs (new_stmt, new_temp); | |
4333 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
4334 if (slp_node) | |
4335 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt); | |
4336 } | |
4337 | |
4338 if (slp_node) | |
4339 continue; | |
4340 | |
4341 if (j == 0) | |
4342 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
4343 else | |
4344 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
4345 prev_stmt_info = vinfo_for_stmt (new_stmt); | |
4346 } | |
4347 | |
4348 VEC_free (tree, heap, vec_oprnds0); | |
4349 if (vec_oprnds1) | |
4350 VEC_free (tree, heap, vec_oprnds1); | |
4351 | |
4352 return true; | |
4353 } | |
4354 | |
4355 | |
4356 /* Get vectorized definitions for loop-based vectorization. For the first | |
4357 operand we call vect_get_vec_def_for_operand() (with OPRND containing | |
4358 scalar operand), and for the rest we get a copy with | |
4359 vect_get_vec_def_for_stmt_copy() using the previous vector definition | |
4360 (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details. | |
4361 The vectors are collected into VEC_OPRNDS. */ | |
4362 | |
4363 static void | |
4364 vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt, | |
4365 VEC (tree, heap) **vec_oprnds, int multi_step_cvt) | |
4366 { | |
4367 tree vec_oprnd; | |
4368 | |
4369 /* Get first vector operand. */ | |
4370 /* All the vector operands except the very first one (which is the scalar | |
4371 oprnd) are stmt copies. */ | |
4372 if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE) | |
4373 vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL); | |
4374 else | |
4375 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd); | |
4376 | |
4377 VEC_quick_push (tree, *vec_oprnds, vec_oprnd); | |
4378 | |
4379 /* Get second vector operand. */ | |
4380 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd); | |
4381 VEC_quick_push (tree, *vec_oprnds, vec_oprnd); | |
4382 | |
4383 *oprnd = vec_oprnd; | |
4384 | |
4385 /* For conversion in multiple steps, continue to get operands | |
4386 recursively. */ | |
4387 if (multi_step_cvt) | |
4388 vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds, multi_step_cvt - 1); | |
4389 } | |
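/* For instance (editor's note): with MULTI_STEP_CVT == 1 the function
   pushes two defs and then recurses once, so VEC_OPRNDS ends up holding
   four consecutive vector defs -- enough to be narrowed pairwise twice by
   vect_create_vectorized_demotion_stmts below.  */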
4390 | |
4391 | |
4392 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS. | |
4393 For multi-step conversions store the resulting vectors and call the function | |
4394 recursively. */ | |
4395 | |
4396 static void | |
4397 vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds, | |
4398 int multi_step_cvt, gimple stmt, | |
4399 VEC (tree, heap) *vec_dsts, | |
4400 gimple_stmt_iterator *gsi, | |
4401 slp_tree slp_node, enum tree_code code, | |
4402 stmt_vec_info *prev_stmt_info) | |
4403 { | |
4404 unsigned int i; | |
4405 tree vop0, vop1, new_tmp, vec_dest; | |
4406 gimple new_stmt; | |
4407 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
4408 | |
4409 vec_dest = VEC_pop (tree, vec_dsts); | |
4410 | |
4411 for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2) | |
4412 { | |
4413 /* Create demotion operation. */ | |
4414 vop0 = VEC_index (tree, *vec_oprnds, i); | |
4415 vop1 = VEC_index (tree, *vec_oprnds, i + 1); | |
4416 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1); | |
4417 new_tmp = make_ssa_name (vec_dest, new_stmt); | |
4418 gimple_assign_set_lhs (new_stmt, new_tmp); | |
4419 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
4420 | |
4421 if (multi_step_cvt) | |
4422 /* Store the resulting vector for next recursive call. */ | |
4423 VEC_replace (tree, *vec_oprnds, i/2, new_tmp); | |
4424 else | |
4425 { | |
4426 /* This is the last step of the conversion sequence. Store the | |
4427 vectors in SLP_NODE or in vector info of the scalar statement | |
4428 (or in STMT_VINFO_RELATED_STMT chain). */ | |
4429 if (slp_node) | |
4430 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt); | |
4431 else | |
4432 { | |
4433 if (!*prev_stmt_info) | |
4434 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; | |
4435 else | |
4436 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt; | |
4437 | |
4438 *prev_stmt_info = vinfo_for_stmt (new_stmt); | |
4439 } | |
4440 } | |
4441 } | |
4442 | |
4443 /* For multi-step demotion operations we first generate demotion operations | |
4444 from the source type to the intermediate types, and then combine the | |
4445 results (stored in VEC_OPRNDS) with a demotion operation to the destination | |
4446 type. */ | |
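  /* A worked example (hypothetical types): demoting int to char with
     V4SI/V8HI/V16QI vectors takes two steps.  The first pass over
     VEC_OPRNDS pairs four V4SI operands into two V8HI results:

       vshort0 = pack <vint0, vint1>
       vshort1 = pack <vint2, vint3>

     and the recursive call packs those into the final V16QI:

       vchar = pack <vshort0, vshort1>  */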
4447 if (multi_step_cvt) | |
4448 { | |
4449 /* At each level of recursion we have half of the operands we had at the | |
4450 previous level. */ | |
4451 VEC_truncate (tree, *vec_oprnds, (i+1)/2); | |
4452 vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1, | |
4453 stmt, vec_dsts, gsi, slp_node, | |
4454 code, prev_stmt_info); | |
4455 } | |
4456 } | |
4457 | |
4458 | |
4459 /* Function vectorizable_type_demotion | |
4460 | |
4461 Check if STMT performs a binary or unary operation that involves | |
4462 type demotion, and if it can be vectorized. | |
4463 If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
4464 stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
4465 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
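
/* For example, assuming a target that supports the pack operation, a scalar
   stmt such as

     S1: short_s = (short) int_i;

   would conceptually be vectorized here as

     VS1: vshort = VEC_PACK_TRUNC_EXPR <vint0, vint1>;

   i.e. two input vectors of ints are narrowed into one vector of shorts.  */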
4466 | |
4467 bool | |
4468 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi, | |
4469 gimple *vec_stmt, slp_tree slp_node) | |
4470 { | |
4471 tree vec_dest; | |
4472 tree scalar_dest; | |
4473 tree op0; | |
4474 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
4475 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
4476 enum tree_code code, code1 = ERROR_MARK; | |
4477 tree def; | |
4478 gimple def_stmt; | |
4479 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; | |
4480 stmt_vec_info prev_stmt_info; | |
4481 int nunits_in; | |
4482 int nunits_out; | |
4483 tree vectype_out; | |
4484 int ncopies; | |
4485 int j, i; | |
4486 tree vectype_in; | |
4487 int multi_step_cvt = 0; | |
4488 VEC (tree, heap) *vec_oprnds0 = NULL; | |
4489 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL; | |
4490 tree last_oprnd, intermediate_type; | |
4491 | |
4492 if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
4493 return false; | |
4494 | |
4495 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
4496 return false; | |
4497 | |
4498 /* Is STMT a vectorizable type-demotion operation? */ | |
4499 if (!is_gimple_assign (stmt)) | |
4500 return false; | |
4501 | |
4502 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME) | |
4503 return false; | |
4504 | |
4505 code = gimple_assign_rhs_code (stmt); | |
4506 if (!CONVERT_EXPR_CODE_P (code)) | |
4507 return false; | |
4508 | |
4509 op0 = gimple_assign_rhs1 (stmt); | |
4510 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0)); | |
4511 if (!vectype_in) | |
4512 return false; | |
4513 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); | |
4514 | |
4515 scalar_dest = gimple_assign_lhs (stmt); | |
4516 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest)); | |
4517 if (!vectype_out) | |
4518 return false; | |
4519 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); | |
4520 if (nunits_in >= nunits_out) | |
4521 return false; | |
4522 | |
4523 /* Multiple types in SLP are handled by creating the appropriate number of | |
4524 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in | |
4525 case of SLP. */ | |
4526 if (slp_node) | |
4527 ncopies = 1; | |
4528 else | |
4529 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out; | |
4530 | |
4531 gcc_assert (ncopies >= 1); | |
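
  /* E.g. (hypothetical sizes): for char_c = (char) short_s with V8HI input
     (nunits_in = 8), V16QI output (nunits_out = 16) and a vectorization
     factor of 16, ncopies = 16 / 16 = 1; that single copy consumes two
     V8HI defs to fill one V16QI result.  */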
4532 | |
4533 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest)) | |
4534 && INTEGRAL_TYPE_P (TREE_TYPE (op0))) | |
4535 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest)) | |
4536 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0)) | |
4537 && CONVERT_EXPR_CODE_P (code)))) | |
4538 return false; | |
4539 | |
4540 /* Check the operands of the operation. */ | |
4541 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0])) | |
4542 { | |
4543 if (vect_print_dump_info (REPORT_DETAILS)) | |
4544 fprintf (vect_dump, "use not simple."); | |
4545 return false; | |
4546 } | |
4547 | |
4548 /* Supportable by target? */ | |
4549 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1, | |
4550 &multi_step_cvt, &interm_types)) | |
4551 return false; | |
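
  /* As an illustration of the outputs (hypothetical target): if the target
     can only pack one step at a time, an int -> char conversion would come
     back as a two-step narrowing, e.g. CODE1 = VEC_PACK_TRUNC_EXPR,
     MULTI_STEP_CVT = 1, and INTERM_TYPES holding the intermediate
     vector-of-short type.  */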
4552 | |
4553 STMT_VINFO_VECTYPE (stmt_info) = vectype_in; | |
4554 | |
4555 if (!vec_stmt) /* transformation not required. */ | |
4556 { | |
4557 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; | |
4558 if (vect_print_dump_info (REPORT_DETAILS)) | |
4559 fprintf (vect_dump, "=== vectorizable_demotion ==="); | |
4560 vect_model_simple_cost (stmt_info, ncopies, dt, NULL); | |
4561 return true; | |
4562 } | |
4563 | |
4564 /** Transform. **/ | |
4565 if (vect_print_dump_info (REPORT_DETAILS)) | |
4566 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.", | |
4567 ncopies); | |
4568 | |
4569 /* In case of multi-step demotion, we first generate demotion operations to | |
4570 the intermediate types, and then from those types to the final one. | |
4571 We create vector destinations for the intermediate types (TYPES) received | |
4572 from supportable_narrowing_operation, and store them in the correct order | |
4573 for future use in vect_create_vectorized_demotion_stmts(). */ | |
4574 if (multi_step_cvt) | |
4575 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1); | |
4576 else | |
4577 vec_dsts = VEC_alloc (tree, heap, 1); | |
4578 | |
4579 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); | |
4580 VEC_quick_push (tree, vec_dsts, vec_dest); | |
4581 | |
4582 if (multi_step_cvt) | |
4583 { | |
4584 for (i = VEC_length (tree, interm_types) - 1; | |
4585 VEC_iterate (tree, interm_types, i, intermediate_type); i--) | |
4586 { | |
4587 vec_dest = vect_create_destination_var (scalar_dest, | |
4588 intermediate_type); | |
4589 VEC_quick_push (tree, vec_dsts, vec_dest); | |
4590 } | |
4591 } | |
4592 | |
4593 /* In case the vectorization factor (VF) is bigger than the number | |
4594 of elements that we can fit in a vectype (nunits), we have to generate | |
4595 more than one vector stmt - i.e - we need to "unroll" the | |
4596 vector stmt by a factor VF/nunits. */ | |
4597 last_oprnd = op0; | |
4598 prev_stmt_info = NULL; | |
4599 for (j = 0; j < ncopies; j++) | |
4600 { | |
4601 /* Handle uses. */ | |
4602 if (slp_node) | |
4603 vect_get_slp_defs (slp_node, &vec_oprnds0, NULL); | |
4604 else | |
4605 { | |
4606 VEC_free (tree, heap, vec_oprnds0); | |
4607 vec_oprnds0 = VEC_alloc (tree, heap, | |
4608 (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2)); | |
4609 vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0, | |
4610 vect_pow2 (multi_step_cvt) - 1); | |
4611 } | |
4612 | |
4613 /* Arguments are ready. Create the new vector stmts. */ | |
4614 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts); | |
4615 vect_create_vectorized_demotion_stmts (&vec_oprnds0, | |
4616 multi_step_cvt, stmt, tmp_vec_dsts, | |
4617 gsi, slp_node, code1, | |
4618 &prev_stmt_info); | |
4619 } | |
4620 | |
4621 VEC_free (tree, heap, vec_oprnds0); | |
4622 VEC_free (tree, heap, vec_dsts); | |
4623 VEC_free (tree, heap, tmp_vec_dsts); | |
4624 VEC_free (tree, heap, interm_types); | |
4625 | |
4626 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); | |
4627 return true; | |
4628 } | |
4629 | |
4630 | |
4631 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0 | |
4632 and VEC_OPRNDS1 (for binary operations). For multi-step conversions store | |
4633 the resulting vectors and call the function recursively. */ | |
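
/* For illustration: each input vector yields two result vectors, e.g. a
   V8HI operand unpacked (on a hypothetical target) as

     vint_lo = VEC_UNPACK_LO_EXPR <vshort>
     vint_hi = VEC_UNPACK_HI_EXPR <vshort>

   so each promotion step doubles the number of vectors in flight.  */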
4634 | |
4635 static void | |
4636 vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0, | |
4637 VEC (tree, heap) **vec_oprnds1, | |
4638 int multi_step_cvt, gimple stmt, | |
4639 VEC (tree, heap) *vec_dsts, | |
4640 gimple_stmt_iterator *gsi, | |
4641 slp_tree slp_node, enum tree_code code1, | |
4642 enum tree_code code2, tree decl1, | |
4643 tree decl2, int op_type, | |
4644 stmt_vec_info *prev_stmt_info) | |
4645 { | |
4646 int i; | |
4647 tree vop0, vop1, new_tmp1, new_tmp2, vec_dest; | |
4648 gimple new_stmt1, new_stmt2; | |
4649 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
4650 VEC (tree, heap) *vec_tmp; | |
4651 | |
4652 vec_dest = VEC_pop (tree, vec_dsts); | |
4653 vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2); | |
4654 | |
4655 for (i = 0; VEC_iterate (tree, *vec_oprnds0, i, vop0); i++) | |
4656 { | |
4657 if (op_type == binary_op) | |
4658 vop1 = VEC_index (tree, *vec_oprnds1, i); | |
4659 else | |
4660 vop1 = NULL_TREE; | |
4661 | |
4662 /* Generate the two halves of the promotion operation. */ | |
4663 new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1, | |
4664 op_type, vec_dest, gsi, stmt); | |
4665 new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1, | |
4666 op_type, vec_dest, gsi, stmt); | |
4667 if (is_gimple_call (new_stmt1)) | |
4668 { | |
4669 new_tmp1 = gimple_call_lhs (new_stmt1); | |
4670 new_tmp2 = gimple_call_lhs (new_stmt2); | |
4671 } | |
4672 else | |
4673 { | |
4674 new_tmp1 = gimple_assign_lhs (new_stmt1); | |
4675 new_tmp2 = gimple_assign_lhs (new_stmt2); | |
4676 } | |
4677 | |
4678 if (multi_step_cvt) | |
4679 { | |
4680 /* Store the results for the recursive call. */ | |
4681 VEC_quick_push (tree, vec_tmp, new_tmp1); | |
4682 VEC_quick_push (tree, vec_tmp, new_tmp2); | |
4683 } | |
4684 else | |
4685 { | |
4686 /* Last step of the promotion sequence - store the results. */ | |
4687 if (slp_node) | |
4688 { | |
4689 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1); | |
4690 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2); | |
4691 } | |
4692 else | |
4693 { | |
4694 if (!*prev_stmt_info) | |
4695 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1; | |
4696 else | |
4697 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1; | |
4698 | |
4699 *prev_stmt_info = vinfo_for_stmt (new_stmt1); | |
4700 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2; | |
4701 *prev_stmt_info = vinfo_for_stmt (new_stmt2); | |
4702 } | |
4703 } | |
4704 } | |
4705 | |
4706 if (multi_step_cvt) | |
4707 { | |
4708 /* For a multi-step promotion operation we call the function recursively | |
4709 for every stage. We start from the input type, create promotion | |
4710 operations to the intermediate types, and then create promotions to | |
4711 the output type. */ | |
4712 *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp); | |
4713 VEC_free (tree, heap, vec_tmp); | |
4714 vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1, | |
4715 multi_step_cvt - 1, stmt, | |
4716 vec_dsts, gsi, slp_node, code1, | |
4717 code2, decl1, decl2, op_type, | |
4718 prev_stmt_info); | |
4719 } | |
4720 } | |
4721 | |
4722 | |
4723 /* Function vectorizable_type_promotion | |
4724 | |
4725 Check if STMT performs a binary or unary operation that involves | |
4726 type promotion, and if it can be vectorized. | |
4727 If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
4728 stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
4729 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
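
/* For example, a scalar stmt such as

     S1: int_i = (int) short_s;

   would, on a target with unpack support, be vectorized as a pair of stmts:

     VS1: vint_lo = VEC_UNPACK_LO_EXPR <vshort>;
     VS2: vint_hi = VEC_UNPACK_HI_EXPR <vshort>;

   WIDEN_MULT_EXPR is handled the same way, with the two halves produced by
   whatever widening-multiply codes or builtins the target provides.  */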
4730 | |
4731 bool | |
4732 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi, | |
4733 gimple *vec_stmt, slp_tree slp_node) | |
4734 { | |
4735 tree vec_dest; | |
4736 tree scalar_dest; | |
4737 tree op0, op1 = NULL; | |
4738 tree vec_oprnd0=NULL, vec_oprnd1=NULL; | |
4739 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
4740 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
4741 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK; | |
4742 tree decl1 = NULL_TREE, decl2 = NULL_TREE; | |
4743 int op_type; | |
4744 tree def; | |
4745 gimple def_stmt; | |
4746 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; | |
4747 stmt_vec_info prev_stmt_info; | |
4748 int nunits_in; | |
4749 int nunits_out; | |
4750 tree vectype_out; | |
4751 int ncopies; | |
4752 int j, i; | |
4753 tree vectype_in; | |
4754 tree intermediate_type = NULL_TREE; | |
4755 int multi_step_cvt = 0; | |
4756 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL; | |
4757 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL; | |
4758 | |
4759 if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
4760 return false; | |
4761 | |
4762 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
4763 return false; | |
4764 | |
4765 /* Is STMT a vectorizable type-promotion operation? */ | |
4766 if (!is_gimple_assign (stmt)) | |
4767 return false; | |
4768 | |
4769 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME) | |
4770 return false; | |
4771 | |
4772 code = gimple_assign_rhs_code (stmt); | |
4773 if (!CONVERT_EXPR_CODE_P (code) | |
4774 && code != WIDEN_MULT_EXPR) | |
4775 return false; | |
4776 | |
4777 op0 = gimple_assign_rhs1 (stmt); | |
4778 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0)); | |
4779 if (!vectype_in) | |
4780 return false; | |
4781 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); | |
4782 | |
4783 scalar_dest = gimple_assign_lhs (stmt); | |
4784 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest)); | |
4785 if (!vectype_out) | |
4786 return false; | |
4787 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); | |
4788 if (nunits_in <= nunits_out) | |
4789 return false; | |
4790 | |
4791 /* Multiple types in SLP are handled by creating the appropriate number of | |
4792 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in | |
4793 case of SLP. */ | |
4794 if (slp_node) | |
4795 ncopies = 1; | |
4796 else | |
4797 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; | |
4798 | |
4799 gcc_assert (ncopies >= 1); | |
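
  /* E.g. (hypothetical sizes): for int_i = (int) short_s with V8HI input
     (nunits_in = 8) and a vectorization factor of 8, ncopies = 8 / 8 = 1;
     that single copy emits two V4SI stmts - the lo and hi halves.  */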
4800 | |
4801 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest)) | |
4802 && INTEGRAL_TYPE_P (TREE_TYPE (op0))) | |
4803 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest)) | |
4804 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0)) | |
4805 && CONVERT_EXPR_CODE_P (code)))) | |
4806 return false; | |
4807 | |
4808 /* Check the operands of the operation. */ | |
4809 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0])) | |
4810 { | |
4811 if (vect_print_dump_info (REPORT_DETAILS)) | |
4812 fprintf (vect_dump, "use not simple."); | |
4813 return false; | |
4814 } | |
4815 | |
4816 op_type = TREE_CODE_LENGTH (code); | |
4817 if (op_type == binary_op) | |
4818 { | |
4819 op1 = gimple_assign_rhs2 (stmt); | |
4820 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1])) | |
4821 { | |
4822 if (vect_print_dump_info (REPORT_DETAILS)) | |
4823 fprintf (vect_dump, "use not simple."); | |
4824 return false; | |
4825 } | |
4826 } | |
4827 | |
4828 /* Supportable by target? */ | |
4829 if (!supportable_widening_operation (code, stmt, vectype_in, | |
4830 &decl1, &decl2, &code1, &code2, | |
4831 &multi_step_cvt, &interm_types)) | |
4832 return false; | |
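
  /* As a sketch of the outputs (hypothetical target): a short -> int
     conversion might come back with CODE1/CODE2 = VEC_UNPACK_LO_EXPR /
     VEC_UNPACK_HI_EXPR and MULTI_STEP_CVT = 0, while char -> int would
     need MULTI_STEP_CVT = 1 with the intermediate vector-of-short type in
     INTERM_TYPES.  DECL1/DECL2 are set only when the target implements the
     two halves as builtin calls.  */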
4833 | |
4834 /* Binary widening operation can only be supported directly by the | |
4835 architecture. */ | |
4836 gcc_assert (!(multi_step_cvt && op_type == binary_op)); | |
4837 | |
4838 STMT_VINFO_VECTYPE (stmt_info) = vectype_in; | |
4839 | |
4840 if (!vec_stmt) /* transformation not required. */ | |
4841 { | |
4842 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; | |
4843 if (vect_print_dump_info (REPORT_DETAILS)) | |
4844 fprintf (vect_dump, "=== vectorizable_promotion ==="); | |
4845 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL); | |
4846 return true; | |
4847 } | |
4848 | |
4849 /** Transform. **/ | |
4850 | |
4851 if (vect_print_dump_info (REPORT_DETAILS)) | |
4852 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.", | |
4853 ncopies); | |
4854 | |
4855 /* Handle def. */ | |
4856 /* In case of multi-step promotion, we first generate promotion operations | |
4857 to the intermediate types, and then from those types to the final one. | |
4858 We store the vector destinations in VEC_DSTS in the correct order for | |
4859 recursive creation of promotion operations in | |
4860 vect_create_vectorized_promotion_stmts(). Vector destinations are created | |
4861 according to TYPES received from supportable_widening_operation(). */ | |
4862 if (multi_step_cvt) | |
4863 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1); | |
4864 else | |
4865 vec_dsts = VEC_alloc (tree, heap, 1); | |
4866 | |
4867 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); | |
4868 VEC_quick_push (tree, vec_dsts, vec_dest); | |
4869 | |
4870 if (multi_step_cvt) | |
4871 { | |
4872 for (i = VEC_length (tree, interm_types) - 1; | |
4873 VEC_iterate (tree, interm_types, i, intermediate_type); i--) | |
4874 { | |
4875 vec_dest = vect_create_destination_var (scalar_dest, | |
4876 intermediate_type); | |
4877 VEC_quick_push (tree, vec_dsts, vec_dest); | |
4878 } | |
4879 } | |
4880 | |
4881 if (!slp_node) | |
4882 { | |
4883 vec_oprnds0 = VEC_alloc (tree, heap, | |
4884 (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1)); | |
4885 if (op_type == binary_op) | |
4886 vec_oprnds1 = VEC_alloc (tree, heap, 1); | |
4887 } | |
4888 | |
4889 /* In case the vectorization factor (VF) is bigger than the number | |
4890 of elements that we can fit in a vectype (nunits), we have to generate | |
4891 more than one vector stmt - i.e - we need to "unroll" the | |
4892 vector stmt by a factor VF/nunits. */ | |
4893 | |
4894 prev_stmt_info = NULL; | |
4895 for (j = 0; j < ncopies; j++) | |
4896 { | |
4897 /* Handle uses. */ | |
4898 if (j == 0) | |
4899 { | |
4900 if (slp_node) | |
4901 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1); | |
4902 else | |
4903 { | |
4904 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); | |
4905 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0); | |
4906 if (op_type == binary_op) | |
4907 { | |
4908 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL); | |
4909 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1); | |
4910 } | |
4911 } | |
4912 } | |
4913 else | |
4914 { | |
4915 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); | |
4916 VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0); | |
4917 if (op_type == binary_op) | |
4918 { | |
4919 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1); | |
4920 VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1); | |
4921 } | |
4922 } | |
4923 | |
4924 /* Arguments are ready. Create the new vector stmts. */ | |
4925 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts); | |
4926 vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1, | |
4927 multi_step_cvt, stmt, | |
4928 tmp_vec_dsts, | |
4929 gsi, slp_node, code1, code2, | |
4930 decl1, decl2, op_type, | |
4931 &prev_stmt_info); | |
4932 } | |
4933 | |
4934 VEC_free (tree, heap, vec_dsts); | |
4935 VEC_free (tree, heap, tmp_vec_dsts); | |
4936 VEC_free (tree, heap, interm_types); | |
4937 VEC_free (tree, heap, vec_oprnds0); | |
4938 VEC_free (tree, heap, vec_oprnds1); | |
4939 | |
4940 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); | |
4941 return true; | |
4942 } | |
4943 | |
4944 | |
4945 /* Function vect_strided_store_supported. | |
4946 | |
4947 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported, | |
4948 and FALSE otherwise. */ | |
4949 | |
4950 static bool | |
4951 vect_strided_store_supported (tree vectype) | |
4952 { | |
4953 optab interleave_high_optab, interleave_low_optab; | |
4954 int mode; | |
4955 | |
4956 mode = (int) TYPE_MODE (vectype); | |
4957 | |
4958 /* Check that the operation is supported. */ | |
4959 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR, | |
4960 vectype, optab_default); | |
4961 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR, | |
4962 vectype, optab_default); | |
4963 if (!interleave_high_optab || !interleave_low_optab) | |
4964 { | |
4965 if (vect_print_dump_info (REPORT_DETAILS)) | |
4966 fprintf (vect_dump, "no optab for interleave."); | |
4967 return false; | |
4968 } | |
4969 | |
4970 if (optab_handler (interleave_high_optab, mode)->insn_code | |
4971 == CODE_FOR_nothing | |
4972 || optab_handler (interleave_low_optab, mode)->insn_code | |
4973 == CODE_FOR_nothing) | |
4974 { | |
4975 if (vect_print_dump_info (REPORT_DETAILS)) | |
4976 fprintf (vect_dump, "interleave op not supported by target."); | |
4977 return false; | |
4978 } | |
4979 | |
4980 return true; | |
4981 } | |
4982 | |
4983 | |
4984 /* Function vect_permute_store_chain. | |
4985 | |
4986 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be | |
4987 a power of 2, generate interleave_high/low stmts to reorder the data | |
4988 correctly for the stores. Return the final references for stores in | |
4989 RESULT_CHAIN. | |
4990 | |
4991 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. | |
4992 The input is 4 vectors each containing 8 elements. We assign a number to each | |
4993 element, the input sequence is: | |
4994 | |
4995 1st vec: 0 1 2 3 4 5 6 7 | |
4996 2nd vec: 8 9 10 11 12 13 14 15 | |
4997 3rd vec: 16 17 18 19 20 21 22 23 | |
4998 4th vec: 24 25 26 27 28 29 30 31 | |
4999 | |
5000 The output sequence should be: | |
5001 | |
5002 1st vec: 0 8 16 24 1 9 17 25 | |
5003 2nd vec: 2 10 18 26 3 11 19 27 | |
5004 3rd vec: 4 12 20 28 5 13 21 29 | |
5005 4th vec: 6 14 22 30 7 15 23 31 | |
5006 | |
5007 i.e., we interleave the contents of the four vectors in their order. | |
5008 | |
5009 We use interleave_high/low instructions to create such output. The input of | |
5010 each interleave_high/low operation is two vectors: | |
5011 1st vec 2nd vec | |
5012 0 1 2 3 4 5 6 7 | |
5013 the even elements of the result vector are obtained left-to-right from the | |
5014 high/low elements of the first vector. The odd elements of the result are | |
5015 obtained left-to-right from the high/low elements of the second vector. | |
5016 The output of interleave_high will be: 0 4 1 5 | |
5017 and of interleave_low: 2 6 3 7 | |
5018 | |
5019 | |
5020 The permutation is done in log LENGTH stages. In each stage interleave_high | |
5021 and interleave_low stmts are created for each pair of vectors in DR_CHAIN, | |
5022 where the first argument is taken from the first half of DR_CHAIN and the | |
5023 second argument from its second half. | |
5024 In our example, | |
5025 | |
5026 I1: interleave_high (1st vec, 3rd vec) | |
5027 I2: interleave_low (1st vec, 3rd vec) | |
5028 I3: interleave_high (2nd vec, 4th vec) | |
5029 I4: interleave_low (2nd vec, 4th vec) | |
5030 | |
5031 The output for the first stage is: | |
5032 | |
5033 I1: 0 16 1 17 2 18 3 19 | |
5034 I2: 4 20 5 21 6 22 7 23 | |
5035 I3: 8 24 9 25 10 26 11 27 | |
5036 I4: 12 28 13 29 14 30 15 31 | |
5037 | |
5038 The output of the second stage, i.e. the final result is: | |
5039 | |
5040 I1: 0 8 16 24 1 9 17 25 | |
5041 I2: 2 10 18 26 3 11 19 27 | |
5042 I3: 4 12 20 28 5 13 21 29 | |
5043 I4: 6 14 22 30 7 15 23 31. */ | |
5044 | |
5045 static bool | |
5046 vect_permute_store_chain (VEC(tree,heap) *dr_chain, | |
5047 unsigned int length, | |
5048 gimple stmt, | |
5049 gimple_stmt_iterator *gsi, | |
5050 VEC(tree,heap) **result_chain) | |
5051 { | |
5052 tree perm_dest, vect1, vect2, high, low; | |
5053 gimple perm_stmt; | |
5054 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); | |
5055 tree scalar_dest; | |
5056 int i; | |
5057 unsigned int j; | |
5058 enum tree_code high_code, low_code; | |
5059 | |
5060 scalar_dest = gimple_assign_lhs (stmt); | |
5061 | |
5062 /* Check that the operation is supported. */ | |
5063 if (!vect_strided_store_supported (vectype)) | |
5064 return false; | |
5065 | |
5066 *result_chain = VEC_copy (tree, heap, dr_chain); | |
5067 | |
5068 for (i = 0; i < exact_log2 (length); i++) | |
5069 { | |
5070 for (j = 0; j < length/2; j++) | |
5071 { | |
5072 vect1 = VEC_index (tree, dr_chain, j); | |
5073 vect2 = VEC_index (tree, dr_chain, j+length/2); | |
5074 | |
5075 /* Create interleaving stmt: | |
5076 in the case of big endian: | |
5077 high = interleave_high (vect1, vect2) | |
5078 and in the case of little endian: | |
5079 high = interleave_low (vect1, vect2). */ | |
5080 perm_dest = create_tmp_var (vectype, "vect_inter_high"); | |
5081 DECL_GIMPLE_REG_P (perm_dest) = 1; | |
5082 add_referenced_var (perm_dest); | |
5083 if (BYTES_BIG_ENDIAN) | |
5084 { | |
5085 high_code = VEC_INTERLEAVE_HIGH_EXPR; | |
5086 low_code = VEC_INTERLEAVE_LOW_EXPR; | |
5087 } | |
5088 else | |
5089 { | |
5090 low_code = VEC_INTERLEAVE_HIGH_EXPR; | |
5091 high_code = VEC_INTERLEAVE_LOW_EXPR; | |
5092 } | |
5093 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest, | |
5094 vect1, vect2); | |
5095 high = make_ssa_name (perm_dest, perm_stmt); | |
5096 gimple_assign_set_lhs (perm_stmt, high); | |
5097 vect_finish_stmt_generation (stmt, perm_stmt, gsi); | |
5098 VEC_replace (tree, *result_chain, 2*j, high); | |
5099 | |
5100 /* Create interleaving stmt: | |
5101 in the case of big endian: | |
5102 low = interleave_low (vect1, vect2) | |
5103 and in the case of little endian: | |
5104 low = interleave_high (vect1, vect2). */ | |
5105 perm_dest = create_tmp_var (vectype, "vect_inter_low"); | |
5106 DECL_GIMPLE_REG_P (perm_dest) = 1; | |
5107 add_referenced_var (perm_dest); | |
5108 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest, | |
5109 vect1, vect2); | |
5110 low = make_ssa_name (perm_dest, perm_stmt); | |
5111 gimple_assign_set_lhs (perm_stmt, low); | |
5112 vect_finish_stmt_generation (stmt, perm_stmt, gsi); | |
5113 VEC_replace (tree, *result_chain, 2*j+1, low); | |
5114 } | |
5115 dr_chain = VEC_copy (tree, heap, *result_chain); | |
5116 } | |
5117 return true; | |
5118 } | |
5119 | |
5120 | |
5121 /* Function vectorizable_store. | |
5122 | |
5123 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that | |
5124 can be vectorized. | |
5125 If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
5126 stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
5127 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
5128 | |
5129 bool | |
5130 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt, | |
5131 slp_tree slp_node) | |
5132 { | |
5133 tree scalar_dest; | |
5134 tree data_ref; | |
5135 tree op; | |
5136 tree vec_oprnd = NULL_TREE; | |
5137 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
5138 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL; | |
5139 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
5140 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
5141 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
5142 enum machine_mode vec_mode; | |
5143 tree dummy; | |
5144 enum dr_alignment_support alignment_support_scheme; | |
5145 tree def; | |
5146 gimple def_stmt; | |
5147 enum vect_def_type dt; | |
5148 stmt_vec_info prev_stmt_info = NULL; | |
5149 tree dataref_ptr = NULL_TREE; | |
5150 int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
5151 int ncopies; | |
5152 int j; | |
5153 gimple next_stmt, first_stmt = NULL; | |
5154 bool strided_store = false; | |
5155 unsigned int group_size, i; | |
5156 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL; | |
5157 bool inv_p; | |
5158 VEC(tree,heap) *vec_oprnds = NULL; | |
5159 bool slp = (slp_node != NULL); | |
5160 stmt_vec_info first_stmt_vinfo; | |
5161 unsigned int vec_num; | |
5162 | |
5163 /* Multiple types in SLP are handled by creating the appropriate number of | |
5164 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in | |
5165 case of SLP. */ | |
5166 if (slp) | |
5167 ncopies = 1; | |
5168 else | |
5169 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
5170 | |
5171 gcc_assert (ncopies >= 1); | |
5172 | |
5173 /* FORNOW. This restriction should be relaxed. */ | |
5174 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) | |
5175 { | |
5176 if (vect_print_dump_info (REPORT_DETAILS)) | |
5177 fprintf (vect_dump, "multiple types in nested loop."); | |
5178 return false; | |
5179 } | |
5180 | |
5181 if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
5182 return false; | |
5183 | |
5184 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
5185 return false; | |
5186 | |
5187 /* Is vectorizable store? */ | |
5188 | |
5189 if (!is_gimple_assign (stmt)) | |
5190 return false; | |
5191 | |
5192 scalar_dest = gimple_assign_lhs (stmt); | |
5193 if (TREE_CODE (scalar_dest) != ARRAY_REF | |
5194 && TREE_CODE (scalar_dest) != INDIRECT_REF | |
5195 && !STMT_VINFO_STRIDED_ACCESS (stmt_info)) | |
5196 return false; | |
5197 | |
5198 gcc_assert (gimple_assign_single_p (stmt)); | |
5199 op = gimple_assign_rhs1 (stmt); | |
5200 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt)) | |
5201 { | |
5202 if (vect_print_dump_info (REPORT_DETAILS)) | |
5203 fprintf (vect_dump, "use not simple."); | |
5204 return false; | |
5205 } | |
5206 | |
5207 /* The scalar rhs type needs to be trivially convertible to the vector | |
5208 component type. This should always be the case. */ | |
5209 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (op))) | |
5210 { | |
5211 if (vect_print_dump_info (REPORT_DETAILS)) | |
5212 fprintf (vect_dump, "??? operands of different types"); | |
5213 return false; | |
5214 } | |
5215 | |
5216 vec_mode = TYPE_MODE (vectype); | |
5217 /* FORNOW. In some cases we can vectorize even if the data-type is not | |
5218 supported (e.g. array initialization with 0). */ | |
5219 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing) | |
5220 return false; | |
5221 | |
5222 if (!STMT_VINFO_DATA_REF (stmt_info)) | |
5223 return false; | |
5224 | |
5225 if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) | |
5226 { | |
5227 strided_store = true; | |
5228 first_stmt = DR_GROUP_FIRST_DR (stmt_info); | |
5229 if (!vect_strided_store_supported (vectype) | |
5230 && !PURE_SLP_STMT (stmt_info) && !slp) | |
5231 return false; | |
5232 | |
5233 if (first_stmt == stmt) | |
5234 { | |
5235 /* STMT is the leader of the group. Check the operands of all the | |
5236 stmts of the group. */ | |
5237 next_stmt = DR_GROUP_NEXT_DR (stmt_info); | |
5238 while (next_stmt) | |
5239 { | |
5240 gcc_assert (gimple_assign_single_p (next_stmt)); | |
5241 op = gimple_assign_rhs1 (next_stmt); | |
5242 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt)) | |
5243 { | |
5244 if (vect_print_dump_info (REPORT_DETAILS)) | |
5245 fprintf (vect_dump, "use not simple."); | |
5246 return false; | |
5247 } | |
5248 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); | |
5249 } | |
5250 } | |
5251 } | |
5252 | |
5253 if (!vec_stmt) /* transformation not required. */ | |
5254 { | |
5255 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type; | |
5256 vect_model_store_cost (stmt_info, ncopies, dt, NULL); | |
5257 return true; | |
5258 } | |
5259 | |
5260 /** Transform. **/ | |
5261 | |
5262 if (strided_store) | |
5263 { | |
5264 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); | |
5265 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)); | |
5266 | |
5267 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++; | |
5268 | |
5269 /* FORNOW */ | |
5270 gcc_assert (!nested_in_vect_loop_p (loop, stmt)); | |
5271 | |
5272 /* We vectorize all the stmts of the interleaving group when we | |
5273 reach the last stmt in the group. */ | |
5274 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt)) | |
5275 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)) | |
5276 && !slp) | |
5277 { | |
5278 *vec_stmt = NULL; | |
5279 return true; | |
5280 } | |
5281 | |
5282 if (slp) | |
5283 strided_store = false; | |
5284 | |
5285 /* VEC_NUM is the number of vect stmts to be created for this group. */ | |
5286 if (slp) | |
5287 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | |
5288 else | |
5289 vec_num = group_size; | |
5290 } | |
5291 else | |
5292 { | |
5293 first_stmt = stmt; | |
5294 first_dr = dr; | |
5295 group_size = vec_num = 1; | |
5296 first_stmt_vinfo = stmt_info; | |
5297 } | |
5298 | |
5299 if (vect_print_dump_info (REPORT_DETAILS)) | |
5300 fprintf (vect_dump, "transform store. ncopies = %d",ncopies); | |
5301 | |
5302 dr_chain = VEC_alloc (tree, heap, group_size); | |
5303 oprnds = VEC_alloc (tree, heap, group_size); | |
5304 | |
5305 alignment_support_scheme = vect_supportable_dr_alignment (first_dr); | |
5306 gcc_assert (alignment_support_scheme); | |
5307 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */ | |
5308 | |
5309 /* In case the vectorization factor (VF) is bigger than the number | |
5310 of elements that we can fit in a vectype (nunits), we have to generate | |
5311 more than one vector stmt - i.e - we need to "unroll" the | |
5312 vector stmt by a factor VF/nunits. For more details see documentation in | |
5313 vect_get_vec_def_for_copy_stmt. */ | |
5314 | |
5315 /* In case of interleaving (non-unit strided access): | |
5316 | |
5317 S1: &base + 2 = x2 | |
5318 S2: &base = x0 | |
5319 S3: &base + 1 = x1 | |
5320 S4: &base + 3 = x3 | |
5321 | |
5322 We create vectorized stores starting from the base address (the access of | |
5323 the first stmt in the chain - S2 in the above example) when the last store | |
5324 stmt of the chain (S4) is reached: | |
5325 | |
5326 VS1: &base = vx2 | |
5327 VS2: &base + vec_size*1 = vx0 | |
5328 VS3: &base + vec_size*2 = vx1 | |
5329 VS4: &base + vec_size*3 = vx3 | |
5330 | |
5331 Then permutation statements are generated: | |
5332 | |
5333 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 > | |
5334 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 > | |
5335 ... | |
5336 | |
5337 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts | |
5338 (the order of the data-refs in the output of vect_permute_store_chain | |
5339 corresponds to the order of scalar stmts in the interleaving chain - see | |
5340 the documentation of vect_permute_store_chain()). | |
5341 | |
5342 In case of both multiple types and interleaving, above vector stores and | |
5343 permutation stmts are created for every copy. The result vector stmts are | |
5344 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding | |
5345 STMT_VINFO_RELATED_STMT for the next copies. | |
5346 */ | |
5347 | |
5348 prev_stmt_info = NULL; | |
5349 for (j = 0; j < ncopies; j++) | |
5350 { | |
5351 gimple new_stmt; | |
5352 gimple ptr_incr; | |
5353 | |
5354 if (j == 0) | |
5355 { | |
5356 if (slp) | |
5357 { | |
5358 /* Get vectorized arguments for SLP_NODE. */ | |
5359 vect_get_slp_defs (slp_node, &vec_oprnds, NULL); | |
5360 | |
5361 vec_oprnd = VEC_index (tree, vec_oprnds, 0); | |
5362 } | |
5363 else | |
5364 { | |
5365 /* For interleaved stores we collect vectorized defs for all the | |
5366 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then | |
5367 used as an input to vect_permute_store_chain(), and OPRNDS as | |
5368 an input to vect_get_vec_def_for_stmt_copy() for the next copy. | |
5369 | |
5370 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and | |
5371 OPRNDS are of size 1. */ | |
5372 next_stmt = first_stmt; | |
5373 for (i = 0; i < group_size; i++) | |
5374 { | |
5375 /* Since gaps are not supported for interleaved stores, | |
5376 GROUP_SIZE is the exact number of stmts in the chain. | |
5377 Therefore, NEXT_STMT can't be NULL_TREE. In case that | |
5378 there is no interleaving, GROUP_SIZE is 1, and only one | |
5379 iteration of the loop will be executed. */ | |
5380 gcc_assert (next_stmt | |
5381 && gimple_assign_single_p (next_stmt)); | |
5382 op = gimple_assign_rhs1 (next_stmt); | |
5383 | |
5384 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, | |
5385 NULL); | |
5386 VEC_quick_push(tree, dr_chain, vec_oprnd); | |
5387 VEC_quick_push(tree, oprnds, vec_oprnd); | |
5388 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); | |
5389 } | |
5390 } | |
5391 | |
5392 /* We should have caught mismatched types earlier. */ | |
5393 gcc_assert (useless_type_conversion_p (vectype, | |
5394 TREE_TYPE (vec_oprnd))); | |
5395 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE, | |
5396 &dummy, &ptr_incr, false, | |
5397 &inv_p, NULL); | |
5398 gcc_assert (!inv_p); | |
5399 } | |
5400 else | |
5401 { | |
5402 /* For interleaved stores we created vectorized defs for all the | |
5403 defs stored in OPRNDS in the previous iteration (previous copy). | |
5404 DR_CHAIN is then used as an input to vect_permute_store_chain(), | |
5405 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the | |
5406 next copy. | |
5407 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and | |
5408 OPRNDS are of size 1. */ | |
5409 for (i = 0; i < group_size; i++) | |
5410 { | |
5411 op = VEC_index (tree, oprnds, i); | |
5412 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); | |
5413 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op); | |
5414 VEC_replace(tree, dr_chain, i, vec_oprnd); | |
5415 VEC_replace(tree, oprnds, i, vec_oprnd); | |
5416 } | |
5417 dataref_ptr = | |
5418 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE); | |
5419 } | |
5420 | |
5421 if (strided_store) | |
5422 { | |
5423 result_chain = VEC_alloc (tree, heap, group_size); | |
5424 /* Permute. */ | |
5425 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi, | |
5426 &result_chain)) | |
5427 return false; | |
5428 } | |
5429 | |
5430 next_stmt = first_stmt; | |
5431 for (i = 0; i < vec_num; i++) | |
5432 { | |
5433 if (i > 0) | |
5434 /* Bump the vector pointer. */ | |
5435 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, | |
5436 NULL_TREE); | |
5437 | |
5438 if (slp) | |
5439 vec_oprnd = VEC_index (tree, vec_oprnds, i); | |
5440 else if (strided_store) | |
5441 /* For strided stores vectorized defs are interleaved in | |
5442 vect_permute_store_chain(). */ | |
5443 vec_oprnd = VEC_index (tree, result_chain, i); | |
5444 | |
5445 data_ref = build_fold_indirect_ref (dataref_ptr); | |
5446 | |
5447 /* Arguments are ready. Create the new vector stmt. */ | |
5448 new_stmt = gimple_build_assign (data_ref, vec_oprnd); | |
5449 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
5450 mark_symbols_for_renaming (new_stmt); | |
5451 | |
5452 if (slp) | |
5453 continue; | |
5454 | |
5455 if (j == 0) | |
5456 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
5457 else | |
5458 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
5459 | |
5460 prev_stmt_info = vinfo_for_stmt (new_stmt); | |
5461 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); | |
5462 if (!next_stmt) | |
5463 break; | |
5464 } | |
5465 } | |
5466 | |
5467 VEC_free (tree, heap, dr_chain); | |
5468 VEC_free (tree, heap, oprnds); | |
5469 if (result_chain) | |
5470 VEC_free (tree, heap, result_chain); | |
5471 | |
5472 return true; | |
5473 } | |
5474 | |
5475 | |
5476 /* Function vect_setup_realignment | |
5477 | |
5478 This function is called when vectorizing an unaligned load using | |
5479 the dr_explicit_realign[_optimized] scheme. | |
5480 This function generates the following code at the loop prolog: | |
5481 | |
5482 p = initial_addr; | |
5483 x msq_init = *(floor(p)); # prolog load | |
5484 realignment_token = call target_builtin; | |
5485 loop: | |
5486 x msq = phi (msq_init, ---) | |
5487 | |
5488 The stmts marked with x are generated only for the case of | |
5489 dr_explicit_realign_optimized. | |
5490 | |
5491 The code above sets up a new (vector) pointer, pointing to the first | |
5492 location accessed by STMT, and a "floor-aligned" load using that pointer. | |
5493 It also generates code to compute the "realignment-token" (if the relevant | |
5494 target hook was defined), and creates a phi-node at the loop-header bb | |
5495 whose arguments are the result of the prolog-load (created by this | |
5496 function) and the result of a load that takes place in the loop (to be | |
5497 created by the caller to this function). | |
5498 | |
5499 For the case of dr_explicit_realign_optimized: | |
5500 The caller to this function uses the phi-result (msq) to create the | |
5501 realignment code inside the loop, and sets up the missing phi argument, | |
5502 as follows: | |
5503 loop: | |
5504 msq = phi (msq_init, lsq) | |
5505 lsq = *(floor(p')); # load in loop | |
5506 result = realign_load (msq, lsq, realignment_token); | |
5507 | |
5508 For the case of dr_explicit_realign: | |
5509 loop: | |
5510 msq = *(floor(p)); # load in loop | |
5511 p' = p + (VS-1); | |
5512 lsq = *(floor(p')); # load in loop | |
5513 result = realign_load (msq, lsq, realignment_token); | |
5514 | |
5515 Input: | |
5516 STMT - (scalar) load stmt to be vectorized. This load accesses | |
5517 a memory location that may be unaligned. | |
5518 BSI - place where new code is to be inserted. | |
5519 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes | |
5520 is used. | |
5521 | |
5522 Output: | |
5523 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load | |
5524 target hook, if defined. | |
5525 Return value - the result of the loop-header phi node. */ | |
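
/* As a concrete sketch (hypothetical V4SI data, address misaligned by one
   element), the two floor-aligned loads and the realignment combine as:

     msq = *floor(p)          = { a0 a1 a2 a3 }
     lsq = *floor(p + VS - 1) = { a4 a5 a6 a7 }
     result = realign_load (msq, lsq, realignment_token) = { a1 a2 a3 a4 }

   i.e. the token encodes the misalignment and selects the desired window
   out of the two aligned vectors.  */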
5526 | |
5527 static tree | |
5528 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi, | |
5529 tree *realignment_token, | |
5530 enum dr_alignment_support alignment_support_scheme, | |
5531 tree init_addr, | |
5532 struct loop **at_loop) | |
5533 { | |
5534 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
5535 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
5536 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
5537 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
5538 edge pe; | |
5539 tree scalar_dest = gimple_assign_lhs (stmt); | |
5540 tree vec_dest; | |
5541 gimple inc; | |
5542 tree ptr; | |
5543 tree data_ref; | |
5544 gimple new_stmt; | |
5545 basic_block new_bb; | |
5546 tree msq_init = NULL_TREE; | |
5547 tree new_temp; | |
5548 gimple phi_stmt; | |
5549 tree msq = NULL_TREE; | |
5550 gimple_seq stmts = NULL; | |
5551 bool inv_p; | |
5552 bool compute_in_loop = false; | |
5553 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); | |
5554 struct loop *containing_loop = (gimple_bb (stmt))->loop_father; | |
5555 struct loop *loop_for_initial_load; | |
5556 | |
5557 gcc_assert (alignment_support_scheme == dr_explicit_realign | |
5558 || alignment_support_scheme == dr_explicit_realign_optimized); | |
5559 | |
5560 /* We need to generate three things: | |
5561 1. the misalignment computation | |
5562 2. the extra vector load (for the optimized realignment scheme). | |
5563 3. the phi node for the two vectors from which the realignment is | |
5564 done (for the optimized realignment scheme). | |
5565 */ | |
5566 | |
5567 /* 1. Determine where to generate the misalignment computation. | |
5568 | |
5569 If INIT_ADDR is NULL_TREE, this indicates that the misalignment | |
5570 calculation will be generated by this function, outside the loop (in the | |
5571 preheader). Otherwise, INIT_ADDR had already been computed for us by the | |
5572 caller, inside the loop. | |
5573 | |
5574 Background: If the misalignment remains fixed throughout the iterations of | |
5575 the loop, then both realignment schemes are applicable, and also the | |
5576 misalignment computation can be done outside LOOP. This is because we are | |
5577 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that | |
5578 are a multiple of VS (the Vector Size), and therefore the misalignment in | |
5579 different vectorized LOOP iterations is always the same. | |
5580 The problem arises only if the memory access is in an inner-loop nested | |
5581 inside LOOP, which is now being vectorized using outer-loop vectorization. | |
5582 This is the only case when the misalignment of the memory access may not | |
5583 remain fixed throughout the iterations of the inner-loop (as explained in | |
5584 detail in vect_supportable_dr_alignment). In this case, not only is the | |
5585 optimized realignment scheme not applicable, but also the misalignment | |
5586 computation (and generation of the realignment token that is passed to | |
5587 REALIGN_LOAD) has to be done inside the loop. | |
5588 | |
5589 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode | |
5590 or not, which in turn determines if the misalignment is computed inside | |
5591 the inner-loop, or outside LOOP. */ | |
5592 | |
5593 if (init_addr != NULL_TREE) | |
5594 { | |
5595 compute_in_loop = true; | |
5596 gcc_assert (alignment_support_scheme == dr_explicit_realign); | |
5597 } | |
5598 | |
5599 | |
5600 /* 2. Determine where to generate the extra vector load. | |
5601 | |
5602 For the optimized realignment scheme, instead of generating two vector | |
5603 loads in each iteration, we generate a single extra vector load in the | |
5604 preheader of the loop, and in each iteration reuse the result of the | |
5605 vector load from the previous iteration. In case the memory access is in | |
5606 an inner-loop nested inside LOOP, which is now being vectorized using | |
5607 outer-loop vectorization, we need to determine whether this initial vector | |
5608 load should be generated at the preheader of the inner-loop, or can be | |
5609 generated at the preheader of LOOP. If the memory access has no evolution | |
5610 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has | |
5611 to be generated inside LOOP (in the preheader of the inner-loop). */ | |
5612 | |
5613 if (nested_in_vect_loop) | |
5614 { | |
5615 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); | |
5616 bool invariant_in_outerloop = | |
5617 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); | |
5618 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); | |
5619 } | |
5620 else | |
5621 loop_for_initial_load = loop; | |
5622 if (at_loop) | |
5623 *at_loop = loop_for_initial_load; | |
5624 | |
5625 /* 3. For the case of the optimized realignment, create the first vector | |
5626 load at the loop preheader. */ | |
5627 | |
5628 if (alignment_support_scheme == dr_explicit_realign_optimized) | |
5629 { | |
5630 /* Create msq_init = *(floor(p1)) in the loop preheader */ | |
5631 | |
5632 gcc_assert (!compute_in_loop); | |
5633 pe = loop_preheader_edge (loop_for_initial_load); | |
5634 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
5635 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE, | |
5636 &init_addr, &inc, true, &inv_p, NULL_TREE); | |
5637 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); | |
5638 new_stmt = gimple_build_assign (vec_dest, data_ref); | |
5639 new_temp = make_ssa_name (vec_dest, new_stmt); | |
5640 gimple_assign_set_lhs (new_stmt, new_temp); | |
5641 mark_symbols_for_renaming (new_stmt); | |
5642 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); | |
5643 gcc_assert (!new_bb); | |
5644 msq_init = gimple_assign_lhs (new_stmt); | |
5645 } | |
5646 | |
5647 /* 4. Create realignment token using a target builtin, if available. | |
5648 It is done either inside the containing loop, or before LOOP (as | |
5649 determined above). */ | |
5650 | |
5651 if (targetm.vectorize.builtin_mask_for_load) | |
5652 { | |
5653 tree builtin_decl; | |
5654 | |
5655 /* Compute INIT_ADDR - the initial address accessed by this memref. */ | |
5656 if (compute_in_loop) | |
5657 gcc_assert (init_addr); /* already computed by the caller. */ | |
5658 else | |
5659 { | |
5660 /* Generate the INIT_ADDR computation outside LOOP. */ | |
5661 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, | |
5662 NULL_TREE, loop); | |
5663 pe = loop_preheader_edge (loop); | |
5664 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | |
5665 gcc_assert (!new_bb); | |
5666 } | |
5667 | |
5668 builtin_decl = targetm.vectorize.builtin_mask_for_load (); | |
5669 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); | |
5670 vec_dest = | |
5671 vect_create_destination_var (scalar_dest, | |
5672 gimple_call_return_type (new_stmt)); | |
5673 new_temp = make_ssa_name (vec_dest, new_stmt); | |
5674 gimple_call_set_lhs (new_stmt, new_temp); | |
5675 | |
5676 if (compute_in_loop) | |
5677 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); | |
5678 else | |
5679 { | |
5680 /* Generate the misalignment computation outside LOOP. */ | |
5681 pe = loop_preheader_edge (loop); | |
5682 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); | |
5683 gcc_assert (!new_bb); | |
5684 } | |
5685 | |
5686 *realignment_token = gimple_call_lhs (new_stmt); | |
5687 | |
5688 /* The result of the CALL_EXPR to this builtin is determined from | |
5689 the value of the parameter and no global variables are touched | |
5690 which makes the builtin a "const" function. Requiring the | |
5691 builtin to have the "const" attribute makes it unnecessary | |
5692 to call mark_call_clobbered. */ | |
5693 gcc_assert (TREE_READONLY (builtin_decl)); | |
5694 } | |
5695 | |
5696 if (alignment_support_scheme == dr_explicit_realign) | |
5697 return msq; | |
5698 | |
5699 gcc_assert (!compute_in_loop); | |
5700 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); | |
5701 | |
5702 | |
5703 /* 5. Create msq = phi <msq_init, lsq> in loop */ | |
5704 | |
5705 pe = loop_preheader_edge (containing_loop); | |
5706 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
5707 msq = make_ssa_name (vec_dest, NULL); | |
5708 phi_stmt = create_phi_node (msq, containing_loop->header); | |
5709 SSA_NAME_DEF_STMT (msq) = phi_stmt; | |
5710 add_phi_arg (phi_stmt, msq_init, pe); | |
5711 | |
5712 return msq; | |
5713 } | |
5714 | |
5715 | |
5716 /* Function vect_strided_load_supported. | |
5717 | |
5718 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported, | |
5719 and FALSE otherwise. */ | |
5720 | |
5721 static bool | |
5722 vect_strided_load_supported (tree vectype) | |
5723 { | |
5724 optab perm_even_optab, perm_odd_optab; | |
5725 int mode; | |
5726 | |
5727 mode = (int) TYPE_MODE (vectype); | |
5728 | |
5729 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype, | |
5730 optab_default); | |
5731 if (!perm_even_optab) | |
5732 { | |
5733 if (vect_print_dump_info (REPORT_DETAILS)) | |
5734 fprintf (vect_dump, "no optab for perm_even."); | |
5735 return false; | |
5736 } | |
5737 | |
5738 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing) | |
5739 { | |
5740 if (vect_print_dump_info (REPORT_DETAILS)) | |
5741 fprintf (vect_dump, "perm_even op not supported by target."); | |
5742 return false; | |
5743 } | |
5744 | |
5745 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype, | |
5746 optab_default); | |
5747 if (!perm_odd_optab) | |
5748 { | |
5749 if (vect_print_dump_info (REPORT_DETAILS)) | |
5750 fprintf (vect_dump, "no optab for perm_odd."); | |
5751 return false; | |
5752 } | |
5753 | |
5754 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing) | |
5755 { | |
5756 if (vect_print_dump_info (REPORT_DETAILS)) | |
5757 fprintf (vect_dump, "perm_odd op not supported by target."); | |
5758 return false; | |
5759 } | |
5760 return true; | |
5761 } | |
5762 | |
5763 | |
5764 /* Function vect_permute_load_chain. | |
5765 | |
5766 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be | |
5767 a power of 2, generate extract_even/odd stmts to reorder the input data | |
5768 correctly. Return the final references for loads in RESULT_CHAIN. | |
5769 | |
5770 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. | |
5771 The input is 4 vectors each containing 8 elements. We assign a number to each | |
5772 element, the input sequence is: | |
5773 | |
5774 1st vec: 0 1 2 3 4 5 6 7 | |
5775 2nd vec: 8 9 10 11 12 13 14 15 | |
5776 3rd vec: 16 17 18 19 20 21 22 23 | |
5777 4th vec: 24 25 26 27 28 29 30 31 | |
5778 | |
5779 The output sequence should be: | |
5780 | |
5781 1st vec: 0 4 8 12 16 20 24 28 | |
5782 2nd vec: 1 5 9 13 17 21 25 29 | |
5783 3rd vec: 2 6 10 14 18 22 26 30 | |
5784 4th vec: 3 7 11 15 19 23 27 31 | |
5785 | |
5786 i.e., the first output vector should contain the first elements of each | |
5787 interleaving group, etc. | |
5788 | |
5789 We use extract_even/odd instructions to create such output. The input of each | |
5790 extract_even/odd operation is two vectors | |
5791 1st vec 2nd vec | |
5792 0 1 2 3 4 5 6 7 | |
5793 | |
5794 and the output is the vector of extracted even/odd elements. The output of | |
5795 extract_even will be: 0 2 4 6 | |
5796 and of extract_odd: 1 3 5 7 | |
5797 | |
5798 | |
5799 The permutation is done in log LENGTH stages. In each stage extract_even and | |
5800 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their | |
5801 order. In our example, | |
5802 | |
5803 E1: extract_even (1st vec, 2nd vec) | |
5804 E2: extract_odd (1st vec, 2nd vec) | |
5805 E3: extract_even (3rd vec, 4th vec) | |
5806 E4: extract_odd (3rd vec, 4th vec) | |
5807 | |
5808 The output for the first stage will be: | |
5809 | |
5810 E1: 0 2 4 6 8 10 12 14 | |
5811 E2: 1 3 5 7 9 11 13 15 | |
5812 E3: 16 18 20 22 24 26 28 30 | |
5813 E4: 17 19 21 23 25 27 29 31 | |
5814 | |
5815 In order to proceed and create the correct sequence for the next stage (or | |
5816 for the correct output, if the second stage is the last one, as in our | |
5817 example), we first put the output of the extract_even operation and then the | |
5818 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). | |
5819 The input for the second stage is: | |
5820 | |
5821 1st vec (E1): 0 2 4 6 8 10 12 14 | |
5822 2nd vec (E3): 16 18 20 22 24 26 28 30 | |
5823 3rd vec (E2): 1 3 5 7 9 11 13 15 | |
5824 4th vec (E4): 17 19 21 23 25 27 29 31 | |
5825 | |
5826 The output of the second stage: | |
5827 | |
5828 E1: 0 4 8 12 16 20 24 28 | |
5829 E2: 2 6 10 14 18 22 26 30 | |
5830 E3: 1 5 9 13 17 21 25 29 | |
5831 E4: 3 7 11 15 19 23 27 31 | |
5832 | |
5833 And RESULT_CHAIN after reordering: | |
5834 | |
5835 1st vec (E1): 0 4 8 12 16 20 24 28 | |
5836 2nd vec (E3): 1 5 9 13 17 21 25 29 | |
5837 3rd vec (E2): 2 6 10 14 18 22 26 30 | |
5838 4th vec (E4): 3 7 11 15 19 23 27 31. */ | |
5839 | |
5840 static bool | |
5841 vect_permute_load_chain (VEC(tree,heap) *dr_chain, | |
5842 unsigned int length, | |
5843 gimple stmt, | |
5844 gimple_stmt_iterator *gsi, | |
5845 VEC(tree,heap) **result_chain) | |
5846 { | |
5847 tree perm_dest, data_ref, first_vect, second_vect; | |
5848 gimple perm_stmt; | |
5849 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); | |
5850 int i; | |
5851 unsigned int j; | |
5852 | |
5853 /* Check that the operation is supported. */ | |
5854 if (!vect_strided_load_supported (vectype)) | |
5855 return false; | |
5856 | |
5857 *result_chain = VEC_copy (tree, heap, dr_chain); | |
5858 for (i = 0; i < exact_log2 (length); i++) | |
5859 { | |
5860 for (j = 0; j < length; j +=2) | |
5861 { | |
5862 first_vect = VEC_index (tree, dr_chain, j); | |
5863 second_vect = VEC_index (tree, dr_chain, j+1); | |
5864 | |
5865 /* data_ref = permute_even (first_data_ref, second_data_ref); */ | |
5866 perm_dest = create_tmp_var (vectype, "vect_perm_even"); | |
5867 DECL_GIMPLE_REG_P (perm_dest) = 1; | |
5868 add_referenced_var (perm_dest); | |
5869 | |
5870 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR, | |
5871 perm_dest, first_vect, | |
5872 second_vect); | |
5873 | |
5874 data_ref = make_ssa_name (perm_dest, perm_stmt); | |
5875 gimple_assign_set_lhs (perm_stmt, data_ref); | |
5876 vect_finish_stmt_generation (stmt, perm_stmt, gsi); | |
5877 mark_symbols_for_renaming (perm_stmt); | |
5878 | |
5879 VEC_replace (tree, *result_chain, j/2, data_ref); | |
5880 | |
5881 /* data_ref = permute_odd (first_data_ref, second_data_ref); */ | |
5882 perm_dest = create_tmp_var (vectype, "vect_perm_odd"); | |
5883 DECL_GIMPLE_REG_P (perm_dest) = 1; | |
5884 add_referenced_var (perm_dest); | |
5885 | |
5886 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR, | |
5887 perm_dest, first_vect, | |
5888 second_vect); | |
5889 data_ref = make_ssa_name (perm_dest, perm_stmt); | |
5890 gimple_assign_set_lhs (perm_stmt, data_ref); | |
5891 vect_finish_stmt_generation (stmt, perm_stmt, gsi); | |
5892 mark_symbols_for_renaming (perm_stmt); | |
5893 | |
5894 VEC_replace (tree, *result_chain, j/2+length/2, data_ref); | |
5895 } | |
5896 dr_chain = VEC_copy (tree, heap, *result_chain); | |
5897 } | |
5898 return true; | |
5899 } | |
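/* An illustrative scalar model of one stage of the network above (a
   comment-only sketch; IN, EVEN and ODD are hypothetical arrays, each
   "vector" modeled as an array of NUNITS elements):

     for (j = 0; j < length; j += 2)
       for (e = 0; e < nunits; e++)
         {
           even[j/2][e] = e < nunits/2 ? in[j][2*e]
                                       : in[j+1][2*e - nunits];
           odd[j/2][e]  = e < nunits/2 ? in[j][2*e + 1]
                                       : in[j+1][2*e + 1 - nunits];
         }

   I.e., EVEN takes elements 0,2,4,... and ODD takes elements 1,3,5,...
   of the concatenation of two adjacent input vectors; repeating this
   log2(LENGTH) times fully de-interleaves the input, as in the example
   in the comment above.  */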
5900 | |
5901 | |
5902 /* Function vect_transform_strided_load. | |
5903 | |
5904 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements | |
5905 to perform their permutation and attach the resulting vectorized statements | |
5906 to the scalar statements. | |
5907 */ | |
5908 | |
5909 static bool | |
5910 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size, | |
5911 gimple_stmt_iterator *gsi) | |
5912 { | |
5913 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
5914 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info); | |
5915 gimple next_stmt, new_stmt; | |
5916 VEC(tree,heap) *result_chain = NULL; | |
5917 unsigned int i, gap_count; | |
5918 tree tmp_data_ref; | |
5919 | |
5920 /* DR_CHAIN contains input data-refs that are a part of the interleaving. | |
5921 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted | |
5922 vectors that are ready for vector computation. */ | |
5923 result_chain = VEC_alloc (tree, heap, size); | |
5924 /* Permute. */ | |
5925 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain)) | |
5926 return false; | |
5927 | |
5928 /* Put a permuted data-ref in the VECTORIZED_STMT field. | |
5929 Since we scan the chain starting from its first node, their order | |
5930 corresponds to the order of data-refs in RESULT_CHAIN. */ | |
5931 next_stmt = first_stmt; | |
5932 gap_count = 1; | |
5933 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++) | |
5934 { | |
5935 if (!next_stmt) | |
5936 break; | |
5937 | |
5938 /* Skip the gaps. Loads created for the gaps will be removed later | |
5939 by the dead code elimination pass. No need to check for the first stmt in | |
5940 the group, since it always exists. | |
5941 DR_GROUP_GAP is the number of steps in elements from the previous | |
5942 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that | |
5943 correspond to the gaps. | |
5944 */ | |
5945 if (next_stmt != first_stmt | |
5946 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt))) | |
5947 { | |
5948 gap_count++; | |
5949 continue; | |
5950 } | |
5951 | |
5952 while (next_stmt) | |
5953 { | |
5954 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref); | |
5955 /* We assume that if VEC_STMT is not NULL, this is a case of multiple | |
5956 copies, and we put the new vector statement in the first available | |
5957 RELATED_STMT. */ | |
5958 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt))) | |
5959 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt; | |
5960 else | |
5961 { | |
5962 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt))) | |
5963 { | |
5964 gimple prev_stmt = | |
5965 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)); | |
5966 gimple rel_stmt = | |
5967 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)); | |
5968 while (rel_stmt) | |
5969 { | |
5970 prev_stmt = rel_stmt; | |
5971 rel_stmt = | |
5972 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt)); | |
5973 } | |
5974 | |
5975 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = | |
5976 new_stmt; | |
5977 } | |
5978 } | |
5979 | |
5980 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); | |
5981 gap_count = 1; | |
5982 /* If NEXT_STMT accesses the same DR as the previous statement, | |
5983 put the same TMP_DATA_REF as its vectorized statement; otherwise | |
5984 get the next data-ref from RESULT_CHAIN. */ | |
5985 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt))) | |
5986 break; | |
5987 } | |
5988 } | |
5989 | |
5990 VEC_free (tree, heap, result_chain); | |
5991 return true; | |
5992 } | |
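/* A small example of the gap handling above (values assumed for
   illustration): for a chain with DR_GROUP_SIZE 4 in which only a[4i]
   and a[4i+2] are actually used, the stmt for a[4i+2] has
   DR_GROUP_GAP 2, so one entry of RESULT_CHAIN is skipped between the
   two statements; the load generated for the unused element is dead
   and is removed by DCE later, as noted above.  */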
5993 | |
5994 | |
5995 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by | |
5996 building a vector of type MASK_TYPE from it) and two input vectors placed in | |
5997 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and | |
5998 shifting by STRIDE elements of DR_CHAIN for every copy. | |
5999 (STRIDE is the number of vectorized stmts for NODE divided by the number of | |
6000 copies). | |
6001 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where | |
6002 the created stmts must be inserted. */ | |
6003 | |
6004 static inline void | |
6005 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt, | |
6006 int *mask_array, int mask_nunits, | |
6007 tree mask_element_type, tree mask_type, | |
6008 int first_vec_indx, int second_vec_indx, | |
6009 gimple_stmt_iterator *gsi, slp_tree node, | |
6010 tree builtin_decl, tree vectype, | |
6011 VEC(tree,heap) *dr_chain, | |
6012 int ncopies, int vect_stmts_counter) | |
6013 { | |
6014 tree t = NULL_TREE, mask_vec, mask, perm_dest; | |
6015 gimple perm_stmt = NULL; | |
6016 stmt_vec_info next_stmt_info; | |
6017 int i, group_size, stride, dr_chain_size; | |
6018 tree first_vec, second_vec, data_ref; | |
6019 tree sym; | |
6020 ssa_op_iter iter; | |
6021 VEC (tree, heap) *params = NULL; | |
6022 | |
6023 /* Create a vector mask. */ | |
6024 for (i = mask_nunits - 1; i >= 0; --i) | |
6025 t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]), | |
6026 t); | |
6027 mask_vec = build_vector (mask_type, t); | |
6028 mask = vect_init_vector (stmt, mask_vec, mask_type, NULL); | |
6029 | |
6030 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)); | |
6031 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies; | |
6032 dr_chain_size = VEC_length (tree, dr_chain); | |
6033 | |
6034 /* Initialize the vect stmts of NODE to properly insert the generated | |
6035 stmts later. */ | |
6036 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node)); | |
6037 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++) | |
6038 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL); | |
6039 | |
6040 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype); | |
6041 for (i = 0; i < ncopies; i++) | |
6042 { | |
6043 first_vec = VEC_index (tree, dr_chain, first_vec_indx); | |
6044 second_vec = VEC_index (tree, dr_chain, second_vec_indx); | |
6045 | |
6046 /* Build argument list for the vectorized call. */ | |
6047 VEC_free (tree, heap, params); | |
6048 params = VEC_alloc (tree, heap, 3); | |
6049 VEC_quick_push (tree, params, first_vec); | |
6050 VEC_quick_push (tree, params, second_vec); | |
6051 VEC_quick_push (tree, params, mask); | |
6052 | |
6053 /* Generate the permute statement. */ | |
6054 perm_stmt = gimple_build_call_vec (builtin_decl, params); | |
6055 data_ref = make_ssa_name (perm_dest, perm_stmt); | |
6056 gimple_call_set_lhs (perm_stmt, data_ref); | |
6057 vect_finish_stmt_generation (stmt, perm_stmt, gsi); | |
6058 FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS) | |
6059 { | |
6060 if (TREE_CODE (sym) == SSA_NAME) | |
6061 sym = SSA_NAME_VAR (sym); | |
6062 mark_sym_for_renaming (sym); | |
6063 } | |
6064 | |
6065 /* Store the vector statement in NODE. */ | |
6066 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node), | |
6067 stride * i + vect_stmts_counter, perm_stmt); | |
6068 | |
6069 first_vec_indx += stride; | |
6070 second_vec_indx += stride; | |
6071 } | |
6072 | |
6073 /* Mark the scalar stmt as vectorized. */ | |
6074 next_stmt_info = vinfo_for_stmt (next_scalar_stmt); | |
6075 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt; | |
6076 } | |
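/* For instance (an assumed example, not tied to any particular target):
   with MASK_ARRAY = {0,1,4,5}, MASK_NUNITS = 4 and input vectors
   {a0,a1,a2,a3} and {b0,b1,b2,b3}, the generated call

     vx = builtin_decl ({a0,a1,a2,a3}, {b0,b1,b2,b3}, {0,1,4,5});

   indexes the concatenation of the two inputs, so vx = {a0,a1,b0,b1}.  */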
6077 | |
6078 | |
6079 /* Given FIRST_MASK_ELEMENT - the mask element in element representation, | |
6080 return in CURRENT_MASK_ELEMENT its equivalent in target-specific | |
6081 representation. Check that the mask is valid and return FALSE if not. | |
6082 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to | |
6083 the next vector, i.e., the current first vector is not needed. */ | |
6084 | |
6085 static bool | |
6086 vect_get_mask_element (gimple stmt, int first_mask_element, int m, | |
6087 int mask_nunits, bool only_one_vec, int index, | |
6088 int *mask, int *current_mask_element, | |
6089 bool *need_next_vector) | |
6090 { | |
6091 int i; | |
6092 static int number_of_mask_fixes = 1; | |
6093 static bool mask_fixed = false; | |
6094 static bool needs_first_vector = false; | |
6095 | |
6096 /* Convert to target specific representation. */ | |
6097 *current_mask_element = first_mask_element + m; | |
6098 /* Adjust the value in case it's a mask for the second and third vectors. */ | |
6099 *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1); | |
6100 | |
6101 if (*current_mask_element < mask_nunits) | |
6102 needs_first_vector = true; | |
6103 | |
6104 /* We have only one input vector to permute but the mask accesses values in | |
6105 the next vector as well. */ | |
6106 if (only_one_vec && *current_mask_element >= mask_nunits) | |
6107 { | |
6108 if (vect_print_dump_info (REPORT_DETAILS)) | |
6109 { | |
6110 fprintf (vect_dump, "permutation requires at least two vectors "); | |
6111 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); | |
6112 } | |
6113 | |
6114 return false; | |
6115 } | |
6116 | |
6117 /* The mask requires the next vector. */ | |
6118 if (*current_mask_element >= mask_nunits * 2) | |
6119 { | |
6120 if (needs_first_vector || mask_fixed) | |
6121 { | |
6122 /* We either need the first vector too or have already moved to the | |
6123 next vector. In both cases, this permutation needs three | |
6124 vectors. */ | |
6125 if (vect_print_dump_info (REPORT_DETAILS)) | |
6126 { | |
6127 fprintf (vect_dump, "permutation requires at " | |
6128 "least three vectors "); | |
6129 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); | |
6130 } | |
6131 | |
6132 return false; | |
6133 } | |
6134 | |
6135 /* We move to the next vector, dropping the first one and working with | |
6136 the second and the third - we need to adjust the values of the mask | |
6137 accordingly. */ | |
6138 *current_mask_element -= mask_nunits * number_of_mask_fixes; | |
6139 | |
6140 for (i = 0; i < index; i++) | |
6141 mask[i] -= mask_nunits * number_of_mask_fixes; | |
6142 | |
6143 (number_of_mask_fixes)++; | |
6144 mask_fixed = true; | |
6145 } | |
6146 | |
6147 *need_next_vector = mask_fixed; | |
6148 | |
6149 /* This was the last element of this mask. Start a new one. */ | |
6150 if (index == mask_nunits - 1) | |
6151 { | |
6152 number_of_mask_fixes = 1; | |
6153 mask_fixed = false; | |
6154 needs_first_vector = false; | |
6155 } | |
6156 | |
6157 return true; | |
6158 } | |
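/* A worked example (numbers assumed): with MASK_NUNITS = 4 and
   FIRST_MASK_ELEMENT + M = 9, the element falls beyond the second
   vector (9 >= 2*4).  If the first vector is no longer needed, the
   current element and all mask entries built so far are shifted down
   by 4 (one NUMBER_OF_MASK_FIXES step), 9 becomes 5, and the
   permutation is expressed over the second and third vectors;
   NEED_NEXT_VECTOR is then returned as TRUE.  */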
6159 | |
6160 | |
6161 /* Generate vector permute statements from a list of loads in DR_CHAIN. | |
6162 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid | |
6163 permute statements for SLP_NODE_INSTANCE. */ | |
6164 bool | |
6165 vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain, | |
6166 gimple_stmt_iterator *gsi, int vf, | |
6167 slp_instance slp_node_instance, bool analyze_only) | |
6168 { | |
6169 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
6170 tree mask_element_type = NULL_TREE, mask_type; | |
6171 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index; | |
6172 slp_tree node; | |
6173 tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl; | |
6174 gimple next_scalar_stmt; | |
6175 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); | |
6176 int first_mask_element; | |
6177 int index, unroll_factor, *mask, current_mask_element, ncopies; | |
6178 bool only_one_vec = false, need_next_vector = false; | |
6179 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter; | |
6180 | |
6181 if (!targetm.vectorize.builtin_vec_perm) | |
6182 { | |
6183 if (vect_print_dump_info (REPORT_DETAILS)) | |
6184 { | |
6185 fprintf (vect_dump, "no builtin for vect permute for "); | |
6186 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); | |
6187 } | |
6188 | |
6189 return false; | |
6190 } | |
6191 | |
6192 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype, | |
6193 &mask_element_type); | |
6194 if (!builtin_decl || !mask_element_type) | |
6195 { | |
6196 if (vect_print_dump_info (REPORT_DETAILS)) | |
6197 { | |
6198 fprintf (vect_dump, "no builtin for vect permute for "); | |
6199 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); | |
6200 } | |
6201 | |
6202 return false; | |
6203 } | |
6204 | |
6205 mask_type = get_vectype_for_scalar_type (mask_element_type); | |
6206 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type); | |
6207 mask = (int *) xmalloc (sizeof (int) * mask_nunits); | |
6208 nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
6209 scale = mask_nunits / nunits; | |
6210 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance); | |
6211 | |
6212 /* The number of vector stmts to generate based only on the | |
6213 SLP_NODE_INSTANCE unrolling factor. */ | |
6214 orig_vec_stmts_num = group_size * | |
6215 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits; | |
6216 if (orig_vec_stmts_num == 1) | |
6217 only_one_vec = true; | |
6218 | |
6219 /* The number of copies is determined by the final vectorization factor | |
6220 relative to the SLP_NODE_INSTANCE unrolling factor. */ | |
6221 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance); | |
6222 | |
6223 /* Generate permutation masks for every NODE. Number of masks for each NODE | |
6224 is equal to GROUP_SIZE. | |
6225 E.g., we have a group of three nodes with three loads from the same | |
6226 location in each node, and the vector size is 4. I.e., we have an | |
6227 a0b0c0a1b1c1... sequence and we need to create the following vectors: | |
6228 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3 | |
6229 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3 | |
6230 ... | |
6231 | |
6232 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in a target | |
6233 specific type, e.g., in bytes for Altivec). | |
6234 The last mask is illegal since we assume two operands for permute | |
6235 operation, and the mask element values can't be outside that range. Hence, | |
6236 the last mask must be converted into {2,5,5,5}. | |
6237 For the first two permutations we need the first and the second input | |
6238 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation | |
6239 we need the second and the third vectors: {b1,c1,a2,b2} and | |
6240 {c2,a3,b3,c3}. */ | |
6241 | |
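/* To make the indexing below concrete (under the assumptions of the
   example above: GROUP_SIZE = 3, SCALE = 1, unrolling factor 4): for
   node i = 0 (the a's), j = 0..3 gives FIRST_MASK_ELEMENT
   (0 + j*3)*1 = 0, 3, 6, 9, and each value is pushed once per scalar
   stmt K, i.e. three times; grouping into vectors of 4 yields exactly
   {0,0,0,3} {3,3,6,6} {6,9,9,9}, the last of which is then adjusted by
   vect_get_mask_element as described above.  */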
6242 for (i = 0; | |
6243 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance), | |
6244 i, node); | |
6245 i++) | |
6246 { | |
6247 scalar_index = 0; | |
6248 index = 0; | |
6249 vect_stmts_counter = 0; | |
6250 vec_index = 0; | |
6251 first_vec_index = vec_index++; | |
6252 if (only_one_vec) | |
6253 second_vec_index = first_vec_index; | |
6254 else | |
6255 second_vec_index = vec_index++; | |
6256 | |
6257 for (j = 0; j < unroll_factor; j++) | |
6258 { | |
6259 for (k = 0; k < group_size; k++) | |
6260 { | |
6261 first_mask_element = (i + j * group_size) * scale; | |
6262 for (m = 0; m < scale; m++) | |
6263 { | |
6264 if (!vect_get_mask_element (stmt, first_mask_element, m, | |
6265 mask_nunits, only_one_vec, index, mask, | |
6266 &current_mask_element, &need_next_vector)) | |
6267 return false; | |
6268 | |
6269 mask[index++] = current_mask_element; | |
6270 } | |
6271 | |
6272 if (index == mask_nunits) | |
6273 { | |
6274 index = 0; | |
6275 if (!analyze_only) | |
6276 { | |
6277 if (need_next_vector) | |
6278 { | |
6279 first_vec_index = second_vec_index; | |
6280 second_vec_index = vec_index; | |
6281 } | |
6282 | |
6283 next_scalar_stmt = VEC_index (gimple, | |
6284 SLP_TREE_SCALAR_STMTS (node), scalar_index++); | |
6285 | |
6286 vect_create_mask_and_perm (stmt, next_scalar_stmt, | |
6287 mask, mask_nunits, mask_element_type, mask_type, | |
6288 first_vec_index, second_vec_index, gsi, node, | |
6289 builtin_decl, vectype, dr_chain, ncopies, | |
6290 vect_stmts_counter++); | |
6291 } | |
6292 } | |
6293 } | |
6294 } | |
6295 } | |
6296 | |
6297 free (mask); | |
6298 return true; | |
6299 } | |
6300 | |
6301 /* vectorizable_load. | |
6302 | |
6303 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that | |
6304 can be vectorized. | |
6305 If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
6306 stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
6307 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
6308 | |
6309 bool | |
6310 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt, | |
6311 slp_tree slp_node, slp_instance slp_node_instance) | |
6312 { | |
6313 tree scalar_dest; | |
6314 tree vec_dest = NULL; | |
6315 tree data_ref = NULL; | |
6316 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
6317 stmt_vec_info prev_stmt_info; | |
6318 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
6319 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
6320 struct loop *containing_loop = (gimple_bb (stmt))->loop_father; | |
6321 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); | |
6322 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr; | |
6323 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
6324 tree new_temp; | |
6325 int mode; | |
6326 gimple new_stmt = NULL; | |
6327 tree dummy; | |
6328 enum dr_alignment_support alignment_support_scheme; | |
6329 tree dataref_ptr = NULL_TREE; | |
6330 gimple ptr_incr; | |
6331 int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
6332 int ncopies; | |
6333 int i, j, group_size; | |
6334 tree msq = NULL_TREE, lsq; | |
6335 tree offset = NULL_TREE; | |
6336 tree realignment_token = NULL_TREE; | |
6337 gimple phi = NULL; | |
6338 VEC(tree,heap) *dr_chain = NULL; | |
6339 bool strided_load = false; | |
6340 gimple first_stmt; | |
6341 tree scalar_type; | |
6342 bool inv_p; | |
6343 bool compute_in_loop = false; | |
6344 struct loop *at_loop; | |
6345 int vec_num; | |
6346 bool slp = (slp_node != NULL); | |
6347 bool slp_perm = false; | |
6348 enum tree_code code; | |
6349 | |
6350 /* Multiple types in SLP are handled by creating the appropriate number of | |
6351 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in | |
6352 case of SLP. */ | |
6353 if (slp) | |
6354 ncopies = 1; | |
6355 else | |
6356 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
6357 | |
6358 gcc_assert (ncopies >= 1); | |
6359 | |
6360 /* FORNOW. This restriction should be relaxed. */ | |
6361 if (nested_in_vect_loop && ncopies > 1) | |
6362 { | |
6363 if (vect_print_dump_info (REPORT_DETAILS)) | |
6364 fprintf (vect_dump, "multiple types in nested loop."); | |
6365 return false; | |
6366 } | |
6367 | |
6368 if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance)) | |
6369 slp_perm = true; | |
6370 | |
6371 if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
6372 return false; | |
6373 | |
6374 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
6375 return false; | |
6376 | |
6377 /* Is vectorizable load? */ | |
6378 if (!is_gimple_assign (stmt)) | |
6379 return false; | |
6380 | |
6381 scalar_dest = gimple_assign_lhs (stmt); | |
6382 if (TREE_CODE (scalar_dest) != SSA_NAME) | |
6383 return false; | |
6384 | |
6385 code = gimple_assign_rhs_code (stmt); | |
6386 if (code != ARRAY_REF | |
6387 && code != INDIRECT_REF | |
6388 && !STMT_VINFO_STRIDED_ACCESS (stmt_info)) | |
6389 return false; | |
6390 | |
6391 if (!STMT_VINFO_DATA_REF (stmt_info)) | |
6392 return false; | |
6393 | |
6394 scalar_type = TREE_TYPE (DR_REF (dr)); | |
6395 mode = (int) TYPE_MODE (vectype); | |
6396 | |
6397 /* FORNOW. In some cases we can vectorize even if the data-type is not | |
6398 supported (e.g. data copies). */ | |
6399 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing) | |
6400 { | |
6401 if (vect_print_dump_info (REPORT_DETAILS)) | |
6402 fprintf (vect_dump, "Aligned load, but unsupported type."); | |
6403 return false; | |
6404 } | |
6405 | |
6406 /* The vector component type needs to be trivially convertible to the | |
6407 scalar lhs. This should always be the case. */ | |
6408 if (!useless_type_conversion_p (TREE_TYPE (scalar_dest), TREE_TYPE (vectype))) | |
6409 { | |
6410 if (vect_print_dump_info (REPORT_DETAILS)) | |
6411 fprintf (vect_dump, "??? operands of different types"); | |
6412 return false; | |
6413 } | |
6414 | |
6415 /* Check if the load is a part of an interleaving chain. */ | |
6416 if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) | |
6417 { | |
6418 strided_load = true; | |
6419 /* FORNOW */ | |
6420 gcc_assert (! nested_in_vect_loop); | |
6421 | |
6422 /* Check if interleaving is supported. */ | |
6423 if (!vect_strided_load_supported (vectype) | |
6424 && !PURE_SLP_STMT (stmt_info) && !slp) | |
6425 return false; | |
6426 } | |
6427 | |
6428 if (!vec_stmt) /* transformation not required. */ | |
6429 { | |
6430 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type; | |
6431 vect_model_load_cost (stmt_info, ncopies, NULL); | |
6432 return true; | |
6433 } | |
6434 | |
6435 if (vect_print_dump_info (REPORT_DETAILS)) | |
6436 fprintf (vect_dump, "transform load."); | |
6437 | |
6438 /** Transform. **/ | |
6439 | |
6440 if (strided_load) | |
6441 { | |
6442 first_stmt = DR_GROUP_FIRST_DR (stmt_info); | |
6443 /* Check if the chain of loads is already vectorized. */ | |
6444 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt))) | |
6445 { | |
6446 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); | |
6447 return true; | |
6448 } | |
6449 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); | |
6450 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)); | |
6451 | |
6452 /* VEC_NUM is the number of vect stmts to be created for this group. */ | |
6453 if (slp) | |
6454 { | |
6455 strided_load = false; | |
6456 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | |
6457 } | |
6458 else | |
6459 vec_num = group_size; | |
6460 | |
6461 dr_chain = VEC_alloc (tree, heap, vec_num); | |
6462 } | |
6463 else | |
6464 { | |
6465 first_stmt = stmt; | |
6466 first_dr = dr; | |
6467 group_size = vec_num = 1; | |
6468 } | |
6469 | |
6470 alignment_support_scheme = vect_supportable_dr_alignment (first_dr); | |
6471 gcc_assert (alignment_support_scheme); | |
6472 | |
6473 /* In case the vectorization factor (VF) is bigger than the number | |
6474 of elements that we can fit in a vectype (nunits), we have to generate | |
6475 more than one vector stmt - i.e., we need to "unroll" the | |
6476 vector stmt by a factor VF/nunits. In doing so, we record a pointer | |
6477 from one copy of the vector stmt to the next, in the field | |
6478 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following | |
6479 stages to find the correct vector defs to be used when vectorizing | |
6480 stmts that use the defs of the current stmt. The example below illustrates | |
6481 the vectorization process when VF=16 and nunits=4 (i.e., we need to create | |
6482 4 vectorized stmts): | |
6483 | |
6484 before vectorization: | |
6485 RELATED_STMT VEC_STMT | |
6486 S1: x = memref - - | |
6487 S2: z = x + 1 - - | |
6488 | |
6489 step 1: vectorize stmt S1: | |
6490 We first create the vector stmt VS1_0, and, as usual, record a | |
6491 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1. | |
6492 Next, we create the vector stmt VS1_1, and record a pointer to | |
6493 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0. | |
6494 Similarly, for VS1_2 and VS1_3. This is the resulting chain of | |
6495 stmts and pointers: | |
6496 RELATED_STMT VEC_STMT | |
6497 VS1_0: vx0 = memref0 VS1_1 - | |
6498 VS1_1: vx1 = memref1 VS1_2 - | |
6499 VS1_2: vx2 = memref2 VS1_3 - | |
6500 VS1_3: vx3 = memref3 - - | |
6501 S1: x = load - VS1_0 | |
6502 S2: z = x + 1 - - | |
6503 | |
6504 See the documentation of vect_get_vec_def_for_stmt_copy for how the | |
6505 information we recorded in the RELATED_STMT field is used to vectorize | |
6506 stmt S2. */ | |
6507 | |
6508 /* In case of interleaving (non-unit strided access): | |
6509 | |
6510 S1: x2 = &base + 2 | |
6511 S2: x0 = &base | |
6512 S3: x1 = &base + 1 | |
6513 S4: x3 = &base + 3 | |
6514 | |
6515 Vectorized loads are created in the order of memory accesses | |
6516 starting from the access of the first stmt of the chain: | |
6517 | |
6518 VS1: vx0 = &base | |
6519 VS2: vx1 = &base + vec_size*1 | |
6520 VS3: vx3 = &base + vec_size*2 | |
6521 VS4: vx4 = &base + vec_size*3 | |
6522 | |
6523 Then permutation statements are generated: | |
6524 | |
6525 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 > | |
6526 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 > | |
6527 ... | |
6528 | |
6529 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts | |
6530 (the order of the data-refs in the output of vect_permute_load_chain | |
6531 corresponds to the order of scalar stmts in the interleaving chain - see | |
6532 the documentation of vect_permute_load_chain()). | |
6533 The generation of permutation stmts and recording them in | |
6534 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load(). | |
6535 | |
6536 In case of both multiple types and interleaving, the vector loads and | |
6537 permutation stmts above are created for every copy. The result vector stmts | |
6538 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding | |
6539 STMT_VINFO_RELATED_STMT for the next copies. */ | |
6540 | |
6541 /* If the data reference is aligned (dr_aligned) or potentially unaligned | |
6542 on a target that supports unaligned accesses (dr_unaligned_supported) | |
6543 we generate the following code: | |
6544 p = initial_addr; | |
6545 indx = 0; | |
6546 loop { | |
6547 p = p + indx * vectype_size; | |
6548 vec_dest = *(p); | |
6549 indx = indx + 1; | |
6550 } | |
6551 | |
6552 Otherwise, the data reference is potentially unaligned on a target that | |
6553 does not support unaligned accesses (dr_explicit_realign_optimized) - | |
6554 then generate the following code, in which the data in each iteration is | |
6555 obtained by two vector loads, one from the previous iteration, and one | |
6556 from the current iteration: | |
6557 p1 = initial_addr; | |
6558 msq_init = *(floor(p1)) | |
6559 p2 = initial_addr + VS - 1; | |
6560 realignment_token = call target_builtin; | |
6561 indx = 0; | |
6562 loop { | |
6563 p2 = p2 + indx * vectype_size | |
6564 lsq = *(floor(p2)) | |
6565 vec_dest = realign_load (msq, lsq, realignment_token) | |
6566 indx = indx + 1; | |
6567 msq = lsq; | |
6568 } */ | |
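/* A concrete instance (addresses and sizes assumed for illustration):
   with 16-byte vectors, 4-byte elements, and initial_addr = &a[1] where
   a[0] is 16-byte aligned, floor(p1) is &a[0], so msq_init = a[0..3];
   in the first iteration lsq = *floor(&a[1] + 15) = a[4..7], and
   realign_load shifts the 32 concatenated bytes as directed by
   realignment_token so that vec_dest = a[1..4].  */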
6569 | |
6570 /* If the misalignment remains the same throughout the execution of the | |
6571 loop, we can create the init_addr and permutation mask at the loop | |
6572 preheader. Otherwise, it needs to be created inside the loop. | |
6573 This can only occur when vectorizing memory accesses in the inner-loop | |
6574 nested within an outer-loop that is being vectorized. */ | |
6575 | |
6576 if (nested_in_vect_loop_p (loop, stmt) | |
6577 && (TREE_INT_CST_LOW (DR_STEP (dr)) | |
6578 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)) | |
6579 { | |
6580 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized); | |
6581 compute_in_loop = true; | |
6582 } | |
6583 | |
6584 if ((alignment_support_scheme == dr_explicit_realign_optimized | |
6585 || alignment_support_scheme == dr_explicit_realign) | |
6586 && !compute_in_loop) | |
6587 { | |
6588 msq = vect_setup_realignment (first_stmt, gsi, &realignment_token, | |
6589 alignment_support_scheme, NULL_TREE, | |
6590 &at_loop); | |
6591 if (alignment_support_scheme == dr_explicit_realign_optimized) | |
6592 { | |
6593 phi = SSA_NAME_DEF_STMT (msq); | |
6594 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); | |
6595 } | |
6596 } | |
6597 else | |
6598 at_loop = loop; | |
6599 | |
6600 prev_stmt_info = NULL; | |
6601 for (j = 0; j < ncopies; j++) | |
6602 { | |
6603 /* 1. Create the vector pointer update chain. */ | |
6604 if (j == 0) | |
6605 dataref_ptr = vect_create_data_ref_ptr (first_stmt, | |
6606 at_loop, offset, | |
6607 &dummy, &ptr_incr, false, | |
6608 &inv_p, NULL_TREE); | |
6609 else | |
6610 dataref_ptr = | |
6611 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE); | |
6612 | |
6613 for (i = 0; i < vec_num; i++) | |
6614 { | |
6615 if (i > 0) | |
6616 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, | |
6617 NULL_TREE); | |
6618 | |
6619 /* 2. Create the vector-load in the loop. */ | |
6620 switch (alignment_support_scheme) | |
6621 { | |
6622 case dr_aligned: | |
6623 gcc_assert (aligned_access_p (first_dr)); | |
6624 data_ref = build_fold_indirect_ref (dataref_ptr); | |
6625 break; | |
6626 case dr_unaligned_supported: | |
6627 { | |
6628 int mis = DR_MISALIGNMENT (first_dr); | |
6629 tree tmis = (mis == -1 ? size_zero_node : size_int (mis)); | |
6630 | |
6631 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT)); | |
6632 data_ref = | |
6633 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis); | |
6634 break; | |
6635 } | |
6636 case dr_explicit_realign: | |
6637 { | |
6638 tree ptr, bump; | |
6639 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); | |
6640 | |
6641 if (compute_in_loop) | |
6642 msq = vect_setup_realignment (first_stmt, gsi, | |
6643 &realignment_token, | |
6644 dr_explicit_realign, | |
6645 dataref_ptr, NULL); | |
6646 | |
6647 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); | |
6648 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
6649 new_stmt = gimple_build_assign (vec_dest, data_ref); | |
6650 new_temp = make_ssa_name (vec_dest, new_stmt); | |
6651 gimple_assign_set_lhs (new_stmt, new_temp); | |
6652 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
6653 copy_virtual_operands (new_stmt, stmt); | |
6654 mark_symbols_for_renaming (new_stmt); | |
6655 msq = new_temp; | |
6656 | |
6657 bump = size_binop (MULT_EXPR, vs_minus_1, | |
6658 TYPE_SIZE_UNIT (scalar_type)); | |
6659 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump); | |
6660 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); | |
6661 break; | |
6662 } | |
6663 case dr_explicit_realign_optimized: | |
6664 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); | |
6665 break; | |
6666 default: | |
6667 gcc_unreachable (); | |
6668 } | |
6669 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
6670 new_stmt = gimple_build_assign (vec_dest, data_ref); | |
6671 new_temp = make_ssa_name (vec_dest, new_stmt); | |
6672 gimple_assign_set_lhs (new_stmt, new_temp); | |
6673 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
6674 mark_symbols_for_renaming (new_stmt); | |
6675 | |
6676 /* 3. Handle explicit realignment if necessary/supported. Create in | |
6677 loop: vec_dest = realign_load (msq, lsq, realignment_token) */ | |
6678 if (alignment_support_scheme == dr_explicit_realign_optimized | |
6679 || alignment_support_scheme == dr_explicit_realign) | |
6680 { | |
6681 tree tmp; | |
6682 | |
6683 lsq = gimple_assign_lhs (new_stmt); | |
6684 if (!realignment_token) | |
6685 realignment_token = dataref_ptr; | |
6686 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
6687 tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, | |
6688 realignment_token); | |
6689 new_stmt = gimple_build_assign (vec_dest, tmp); | |
6690 new_temp = make_ssa_name (vec_dest, new_stmt); | |
6691 gimple_assign_set_lhs (new_stmt, new_temp); | |
6692 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
6693 | |
6694 if (alignment_support_scheme == dr_explicit_realign_optimized) | |
6695 { | |
6696 gcc_assert (phi); | |
6697 if (i == vec_num - 1 && j == ncopies - 1) | |
6698 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop)); | |
6699 msq = lsq; | |
6700 } | |
6701 } | |
6702 | |
6703 /* 4. Handle invariant-load. */ | |
6704 if (inv_p) | |
6705 { | |
6706 gcc_assert (!strided_load); | |
6707 gcc_assert (nested_in_vect_loop_p (loop, stmt)); | |
6708 if (j == 0) | |
6709 { | |
6710 int k; | |
6711 tree t = NULL_TREE; | |
6712 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type); | |
6713 | |
6714 /* CHECKME: bitpos depends on endianness? */ | |
6715 bitpos = bitsize_zero_node; | |
6716 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp, | |
6717 bitsize, bitpos); | |
6718 vec_dest = | |
6719 vect_create_destination_var (scalar_dest, NULL_TREE); | |
6720 new_stmt = gimple_build_assign (vec_dest, vec_inv); | |
6721 new_temp = make_ssa_name (vec_dest, new_stmt); | |
6722 gimple_assign_set_lhs (new_stmt, new_temp); | |
6723 vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
6724 | |
6725 for (k = nunits - 1; k >= 0; --k) | |
6726 t = tree_cons (NULL_TREE, new_temp, t); | |
6727 /* FIXME: use build_constructor directly. */ | |
6728 vec_inv = build_constructor_from_list (vectype, t); | |
6729 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi); | |
6730 new_stmt = SSA_NAME_DEF_STMT (new_temp); | |
6731 } | |
6732 else | |
6733 gcc_unreachable (); /* FORNOW. */ | |
6734 } | |
6735 | |
6736 /* Collect vector loads and later create their permutation in | |
6737 vect_transform_strided_load (). */ | |
6738 if (strided_load || slp_perm) | |
6739 VEC_quick_push (tree, dr_chain, new_temp); | |
6740 | |
6741 /* Store vector loads in the corresponding SLP_NODE. */ | |
6742 if (slp && !slp_perm) | |
6743 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt); | |
6744 } | |
6745 | |
6746 if (slp && !slp_perm) | |
6747 continue; | |
6748 | |
6749 if (slp_perm) | |
6750 { | |
6751 if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi, | |
6752 LOOP_VINFO_VECT_FACTOR (loop_vinfo), | |
6753 slp_node_instance, false)) | |
6754 { | |
6755 VEC_free (tree, heap, dr_chain); | |
6756 return false; | |
6757 } | |
6758 } | |
6759 else | |
6760 { | |
6761 if (strided_load) | |
6762 { | |
6763 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi)) | |
6764 return false; | |
6765 | |
6766 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); | |
6767 VEC_free (tree, heap, dr_chain); | |
6768 dr_chain = VEC_alloc (tree, heap, group_size); | |
6769 } | |
6770 else | |
6771 { | |
6772 if (j == 0) | |
6773 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
6774 else | |
6775 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
6776 prev_stmt_info = vinfo_for_stmt (new_stmt); | |
6777 } | |
6778 } | |
6779 } | |
6780 | |
6781 if (dr_chain) | |
6782 VEC_free (tree, heap, dr_chain); | |
6783 | |
6784 return true; | |
6785 } | |
6786 | |
6787 | |
6788 /* Function vectorizable_live_operation. | |
6789 | |
6790 STMT computes a value that is used outside the loop. Check if | |
6791 it can be supported. */ | |
6792 | |
6793 bool | |
6794 vectorizable_live_operation (gimple stmt, | |
6795 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, | |
6796 gimple *vec_stmt ATTRIBUTE_UNUSED) | |
6797 { | |
6798 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
6799 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
6800 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
6801 int i; | |
6802 int op_type; | |
6803 tree op; | |
6804 tree def; | |
6805 gimple def_stmt; | |
6806 enum vect_def_type dt; | |
6807 enum tree_code code; | |
6808 enum gimple_rhs_class rhs_class; | |
6809 | |
6810 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); | |
6811 | |
6812 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) | |
6813 return false; | |
6814 | |
6815 if (!is_gimple_assign (stmt)) | |
6816 return false; | |
6817 | |
6818 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME) | |
6819 return false; | |
6820 | |
6821 /* FORNOW. CHECKME. */ | |
6822 if (nested_in_vect_loop_p (loop, stmt)) | |
6823 return false; | |
6824 | |
6825 code = gimple_assign_rhs_code (stmt); | |
6826 op_type = TREE_CODE_LENGTH (code); | |
6827 rhs_class = get_gimple_rhs_class (code); | |
6828 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op); | |
6829 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op); | |
6830 | |
6831 /* FORNOW: support only if all uses are invariant. This means | |
6832 that the scalar operations can remain in place, unvectorized. | |
6833 The original last scalar value that they compute will be used. */ | |
6834 | |
6835 for (i = 0; i < op_type; i++) | |
6836 { | |
6837 if (rhs_class == GIMPLE_SINGLE_RHS) | |
6838 op = TREE_OPERAND (gimple_op (stmt, 1), i); | |
6839 else | |
6840 op = gimple_op (stmt, i + 1); | |
6841 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt)) | |
6842 { | |
6843 if (vect_print_dump_info (REPORT_DETAILS)) | |
6844 fprintf (vect_dump, "use not simple."); | |
6845 return false; | |
6846 } | |
6847 | |
6848 if (dt != vect_invariant_def && dt != vect_constant_def) | |
6849 return false; | |
6850 } | |
6851 | |
6852 /* No transformation is required for the cases we currently support. */ | |
6853 return true; | |
6854 } | |
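/* E.g. (an illustrative case): for  s_1 = n_2 * 4  where n_2 is defined
   before the loop, s_1 may be live after the loop, but both operands are
   invariant, so the scalar statement can stay in place and its last
   computed value be used; a live statement with a loop-varying operand
   is rejected for now.  */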
6855 | |
6856 | |
6857 /* Function vect_is_simple_cond. | |
6858 | |
6859 Input: | |
6860 LOOP - the loop that is being vectorized. | |
6861 COND - Condition that is checked for simple use. | |
6862 | |
6863 Returns whether a COND can be vectorized. Checks whether | |
6864 condition operands are supportable using vect_is_simple_use. */ | |
6865 | |
6866 static bool | |
6867 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo) | |
6868 { | |
6869 tree lhs, rhs; | |
6870 tree def; | |
6871 enum vect_def_type dt; | |
6872 | |
6873 if (!COMPARISON_CLASS_P (cond)) | |
6874 return false; | |
6875 | |
6876 lhs = TREE_OPERAND (cond, 0); | |
6877 rhs = TREE_OPERAND (cond, 1); | |
6878 | |
6879 if (TREE_CODE (lhs) == SSA_NAME) | |
6880 { | |
6881 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs); | |
6882 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt)) | |
6883 return false; | |
6884 } | |
6885 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST | |
6886 && TREE_CODE (lhs) != FIXED_CST) | |
6887 return false; | |
6888 | |
6889 if (TREE_CODE (rhs) == SSA_NAME) | |
6890 { | |
6891 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs); | |
6892 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt)) | |
6893 return false; | |
6894 } | |
6895 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST | |
6896 && TREE_CODE (rhs) != FIXED_CST) | |
6897 return false; | |
6898 | |
6899 return true; | |
6900 } | |
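/* For example (illustrative only): a condition such as  x_1 < 5  is
   simple when x_1 is an SSA name whose definition vect_is_simple_use
   accepts and 5 is an INTEGER_CST; a condition with, say, a memory
   reference as an operand is not.  */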
6901 | |
6902 /* vectorizable_condition. | |
6903 | |
6904 Check if STMT is a conditional modify expression that can be vectorized. | |
6905 If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
6906 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it | |
6907 at BSI. | |
6908 | |
6909 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
6910 | |
6911 bool | |
6912 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi, | |
6913 gimple *vec_stmt) | |
6914 { | |
6915 tree scalar_dest = NULL_TREE; | |
6916 tree vec_dest = NULL_TREE; | |
6917 tree op = NULL_TREE; | |
6918 tree cond_expr, then_clause, else_clause; | |
6919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
6920 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
6921 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause; | |
6922 tree vec_compare, vec_cond_expr; | |
6923 tree new_temp; | |
6924 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
6925 enum machine_mode vec_mode; | |
6926 tree def; | |
6927 enum vect_def_type dt; | |
6928 int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
6929 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
6930 enum tree_code code; | |
6931 | |
6932 gcc_assert (ncopies >= 1); | |
6933 if (ncopies > 1) | |
6934 return false; /* FORNOW */ | |
6935 | |
6936 if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
6937 return false; | |
6938 | |
6939 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
6940 return false; | |
6941 | |
6942 /* FORNOW: SLP not supported. */ | |
6943 if (STMT_SLP_TYPE (stmt_info)) | |
6944 return false; | |
6945 | |
6946 /* FORNOW: not yet supported. */ | |
6947 if (STMT_VINFO_LIVE_P (stmt_info)) | |
6948 { | |
6949 if (vect_print_dump_info (REPORT_DETAILS)) | |
6950 fprintf (vect_dump, "value used after loop."); | |
6951 return false; | |
6952 } | |
6953 | |
6954 /* Is vectorizable conditional operation? */ | |
6955 if (!is_gimple_assign (stmt)) | |
6956 return false; | |
6957 | |
6958 code = gimple_assign_rhs_code (stmt); | |
6959 | |
6960 if (code != COND_EXPR) | |
6961 return false; | |
6962 | |
6963 gcc_assert (gimple_assign_single_p (stmt)); | |
6964 op = gimple_assign_rhs1 (stmt); | |
6965 cond_expr = TREE_OPERAND (op, 0); | |
6966 then_clause = TREE_OPERAND (op, 1); | |
6967 else_clause = TREE_OPERAND (op, 2); | |
6968 | |
6969 if (!vect_is_simple_cond (cond_expr, loop_vinfo)) | |
6970 return false; | |
6971 | |
6972 /* We do not handle two different vector types for the condition | |
6973 and the values. */ | |
6974 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype)) | |
6975 return false; | |
6976 | |
6977 if (TREE_CODE (then_clause) == SSA_NAME) | |
6978 { | |
6979 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause); | |
6980 if (!vect_is_simple_use (then_clause, loop_vinfo, | |
6981 &then_def_stmt, &def, &dt)) | |
6982 return false; | |
6983 } | |
6984 else if (TREE_CODE (then_clause) != INTEGER_CST | |
6985 && TREE_CODE (then_clause) != REAL_CST | |
6986 && TREE_CODE (then_clause) != FIXED_CST) | |
6987 return false; | |
6988 | |
6989 if (TREE_CODE (else_clause) == SSA_NAME) | |
6990 { | |
6991 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause); | |
6992 if (!vect_is_simple_use (else_clause, loop_vinfo, | |
6993 &else_def_stmt, &def, &dt)) | |
6994 return false; | |
6995 } | |
6996 else if (TREE_CODE (else_clause) != INTEGER_CST | |
6997 && TREE_CODE (else_clause) != REAL_CST | |
6998 && TREE_CODE (else_clause) != FIXED_CST) | |
6999 return false; | |
7000 | |
7001 | |
7002 vec_mode = TYPE_MODE (vectype); | |
7003 | |
7004 if (!vec_stmt) | |
7005 { | |
7006 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type; | |
7007 return expand_vec_cond_expr_p (op, vec_mode); | |
7008 } | |
7009 | |
7010 /* Transform */ | |
7011 | |
7012 /* Handle def. */ | |
7013 scalar_dest = gimple_assign_lhs (stmt); | |
7014 vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
7015 | |
7016 /* Handle cond expr. */ | |
7017 vec_cond_lhs = | |
7018 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL); | |
7019 vec_cond_rhs = | |
7020 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL); | |
7021 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL); | |
7022 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL); | |
7023 | |
7024 /* Arguments are ready. Create the new vector stmt. */ | |
7025 vec_compare = build2 (TREE_CODE (cond_expr), vectype, | |
7026 vec_cond_lhs, vec_cond_rhs); | |
7027 vec_cond_expr = build3 (VEC_COND_EXPR, vectype, | |
7028 vec_compare, vec_then_clause, vec_else_clause); | |
7029 | |
7030 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr); | |
7031 new_temp = make_ssa_name (vec_dest, *vec_stmt); | |
7032 gimple_assign_set_lhs (*vec_stmt, new_temp); | |
7033 vect_finish_stmt_generation (stmt, *vec_stmt, gsi); | |
7034 | |
7035 return true; | |
7036 } | |
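/* As an illustration (SSA names hypothetical): the scalar statement

     x_5 = a_1 < b_2 ? c_3 : d_4;

   is transformed by the code above into

     vx_5 = VEC_COND_EXPR <va_1 < vb_2, vc_3, vd_4>;

   where va_1 ... vd_4 are the vector defs obtained through
   vect_get_vec_def_for_operand.  */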
7037 | |
7038 | |
7039 /* Function vect_transform_stmt. | |
7040 | |
7041 Create a vectorized stmt to replace STMT, and insert it at BSI. */ | |
7042 | |
7043 static bool | |
7044 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi, | |
7045 bool *strided_store, slp_tree slp_node, | |
7046 slp_instance slp_node_instance) | |
7047 { | |
7048 bool is_store = false; | |
7049 gimple vec_stmt = NULL; | |
7050 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
7051 gimple orig_stmt_in_pattern; | |
7052 bool done; | |
7053 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
7054 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
7055 | |
7056 switch (STMT_VINFO_TYPE (stmt_info)) | |
7057 { | |
7058 case type_demotion_vec_info_type: | |
7059 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node); | |
7060 gcc_assert (done); | |
7061 break; | |
7062 | |
7063 case type_promotion_vec_info_type: | |
7064 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node); | |
7065 gcc_assert (done); | |
7066 break; | |
7067 | |
7068 case type_conversion_vec_info_type: | |
7069 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node); | |
7070 gcc_assert (done); | |
7071 break; | |
7072 | |
7073 case induc_vec_info_type: | |
7074 gcc_assert (!slp_node); | |
7075 done = vectorizable_induction (stmt, gsi, &vec_stmt); | |
7076 gcc_assert (done); | |
7077 break; | |
7078 | |
7079 case op_vec_info_type: | |
7080 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node); | |
7081 gcc_assert (done); | |
7082 break; | |
7083 | |
7084 case assignment_vec_info_type: | |
7085 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node); | |
7086 gcc_assert (done); | |
7087 break; | |
7088 | |
7089 case load_vec_info_type: | |
7090 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node, | |
7091 slp_node_instance); | |
7092 gcc_assert (done); | |
7093 break; | |
7094 | |
7095 case store_vec_info_type: | |
7096 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node); | |
7097 gcc_assert (done); | |
7098 if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node) | |
7099 { | |
7100 /* In case of interleaving, the whole chain is vectorized when the | |
7101 last store in the chain is reached. Store stmts before the last | |
7102 one are skipped, and their vec_stmt_info shouldn't be freed | |
7103 meanwhile. */ | |
7104 *strided_store = true; | |
7105 if (STMT_VINFO_VEC_STMT (stmt_info)) | |
7106 is_store = true; | |
7107 } | |
7108 else | |
7109 is_store = true; | |
7110 break; | |
7111 | |
7112 case condition_vec_info_type: | |
7113 gcc_assert (!slp_node); | |
7114 done = vectorizable_condition (stmt, gsi, &vec_stmt); | |
7115 gcc_assert (done); | |
7116 break; | |
7117 | |
7118 case call_vec_info_type: | |
7119 gcc_assert (!slp_node); | |
7120 done = vectorizable_call (stmt, gsi, &vec_stmt); | |
7121 break; | |
7122 | |
7123 case reduc_vec_info_type: | |
7124 gcc_assert (!slp_node); | |
7125 done = vectorizable_reduction (stmt, gsi, &vec_stmt); | |
7126 gcc_assert (done); | |
7127 break; | |
7128 | |
7129 default: | |
7130 if (!STMT_VINFO_LIVE_P (stmt_info)) | |
7131 { | |
7132 if (vect_print_dump_info (REPORT_DETAILS)) | |
7133 fprintf (vect_dump, "stmt not supported."); | |
7134 gcc_unreachable (); | |
7135 } | |
7136 } | |
7137 | |
7138 /* Handle inner-loop stmts whose DEF is used in the loop-nest that | |
7139 is being vectorized, but outside the immediately enclosing loop. */ | |
7140 if (vec_stmt | |
7141 && nested_in_vect_loop_p (loop, stmt) | |
7142 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type | |
7143 && (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer | |
7144 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer_by_reduction)) | |
7145 { | |
7146 struct loop *innerloop = loop->inner; | |
7147 imm_use_iterator imm_iter; | |
7148 use_operand_p use_p; | |
7149 tree scalar_dest; | |
7150 gimple exit_phi; | |
7151 | |
7152 if (vect_print_dump_info (REPORT_DETAILS)) | |
7153 fprintf (vect_dump, "Record the vdef for outer-loop vectorization."); | |
7154 | |
7155 /* Find the relevant loop-exit phi-node, and record the vec_stmt there | |
7156 (to be used when vectorizing outer-loop stmts that use the DEF of | |
7157 STMT). */ | |
7158 if (gimple_code (stmt) == GIMPLE_PHI) | |
7159 scalar_dest = PHI_RESULT (stmt); | |
7160 else | |
7161 scalar_dest = gimple_assign_lhs (stmt); | |
7162 | |
7163 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) | |
7164 { | |
7165 if (!flow_bb_inside_loop_p (innerloop, gimple_bb (USE_STMT (use_p)))) | |
7166 { | |
7167 exit_phi = USE_STMT (use_p); | |
7168 STMT_VINFO_VEC_STMT (vinfo_for_stmt (exit_phi)) = vec_stmt; | |
7169 } | |
7170 } | |
7171 } | |
7172 | |
7173 /* Handle stmts whose DEF is used outside the loop-nest that is | |
7174 being vectorized. */ | |
7175 if (STMT_VINFO_LIVE_P (stmt_info) | |
7176 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type) | |
7177 { | |
7178 done = vectorizable_live_operation (stmt, gsi, &vec_stmt); | |
7179 gcc_assert (done); | |
7180 } | |
7181 | |
7182 if (vec_stmt) | |
7183 { | |
7184 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt; | |
7185 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info); | |
7186 if (orig_stmt_in_pattern) | |
7187 { | |
7188 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern); | |
7189 /* STMT was inserted by the vectorizer to replace a computation idiom. | |
7190 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that | |
7191 computed this idiom. We need to record a pointer to VEC_STMT in | |
7192 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the | |
7193 documentation of vect_pattern_recog. */ | |
7194 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)) | |
7195 { | |
7196 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); | |
7197 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt; | |
7198 } | |
7199 } | |
7200 } | |
7201 | |
7202 return is_store; | |
7203 } | |
7204 | |
7205 | |
7206 /* This function builds ni_name = the number of iterations the loop | |
7207 executes, inserting the computation on the loop preheader edge. */ | |
7208 | |
7209 static tree | |
7210 vect_build_loop_niters (loop_vec_info loop_vinfo) | |
7211 { | |
7212 tree ni_name, var; | |
7213 gimple_seq stmts = NULL; | |
7214 edge pe; | |
7215 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
7216 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo)); | |
7217 | |
7218 var = create_tmp_var (TREE_TYPE (ni), "niters"); | |
7219 add_referenced_var (var); | |
7220 ni_name = force_gimple_operand (ni, &stmts, false, var); | |
7221 | |
7222 pe = loop_preheader_edge (loop); | |
7223 if (stmts) | |
7224 { | |
7225 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | |
7226 gcc_assert (!new_bb); | |
7227 } | |
7228 | |
7229 return ni_name; | |
7230 } | |
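/* For instance (an assumed form of LOOP_VINFO_NITERS): if the loop
   executes  n_7 + 1  times, force_gimple_operand produces something like

     niters_9 = n_7 + 1;

   which is inserted on the preheader edge, and niters_9 is returned.  */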
7231 | |
7232 | |
7233 /* This function generates the following statements: | |
7234 | |
7235 ni_name = number of iterations loop executes | |
7236 ratio = ni_name / vf | |
7237 ratio_mult_vf_name = ratio * vf | |
7238 | |
7239 and places them at the loop preheader edge. */ | |
7240 | |
7241 static void | |
7242 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo, | |
7243 tree *ni_name_ptr, | |
7244 tree *ratio_mult_vf_name_ptr, | |
7245 tree *ratio_name_ptr) | |
7246 { | |
7247 | |
7248 edge pe; | |
7249 basic_block new_bb; | |
7250 gimple_seq stmts; | |
7251 tree ni_name; | |
7252 tree var; | |
7253 tree ratio_name; | |
7254 tree ratio_mult_vf_name; | |
7255 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
7256 tree ni = LOOP_VINFO_NITERS (loop_vinfo); | |
7257 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
7258 tree log_vf; | |
7259 | |
7260 pe = loop_preheader_edge (loop); | |
7261 | |
7262 /* Generate a temporary variable that contains | |
7263 the number of iterations the loop executes. */ | |
7264 | |
7265 ni_name = vect_build_loop_niters (loop_vinfo); | |
7266 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf)); | |
7267 | |
7268 /* Create: ratio = ni >> log2(vf) */ | |
7269 | |
7270 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf); | |
7271 if (!is_gimple_val (ratio_name)) | |
7272 { | |
7273 var = create_tmp_var (TREE_TYPE (ni), "bnd"); | |
7274 add_referenced_var (var); | |
7275 | |
7276 stmts = NULL; | |
7277 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var); | |
7278 pe = loop_preheader_edge (loop); | |
7279 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | |
7280 gcc_assert (!new_bb); | |
7281 } | |
7282 | |
7283 /* Create: ratio_mult_vf = ratio << log2 (vf). */ | |
7284 | |
7285 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name), | |
7286 ratio_name, log_vf); | |
7287 if (!is_gimple_val (ratio_mult_vf_name)) | |
7288 { | |
7289 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf"); | |
7290 add_referenced_var (var); | |
7291 | |
7292 stmts = NULL; | |
7293 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts, | |
7294 true, var); | |
7295 pe = loop_preheader_edge (loop); | |
7296 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | |
7297 gcc_assert (!new_bb); | |
7298 } | |
7299 | |
7300 *ni_name_ptr = ni_name; | |
7301 *ratio_mult_vf_name_ptr = ratio_mult_vf_name; | |
7302 *ratio_name_ptr = ratio_name; | |
7303 | |
7304 return; | |
7305 } | |
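/* Worked example (numbers assumed): for ni_name = 103 and vf = 4,
   log_vf = 2, so ratio = 103 >> 2 = 25 and ratio_mult_vf = 25 << 2 = 100;
   the vectorized loop then executes 25 iterations and the epilog loop
   handles the remaining 103 - 100 = 3 scalar iterations.  */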
7306 | |
7307 | |
7308 /* Function vect_update_ivs_after_vectorizer. | |
7309 | |
7310 "Advance" the induction variables of LOOP to the value they should take | |
7311 after the execution of LOOP. This is currently necessary because the | |
7312 vectorizer does not handle induction variables that are used after the | |
7313 loop. Such a situation occurs when the last iterations of LOOP are | |
7314 peeled, because: | |
7315 1. We introduced new uses after LOOP for IVs that were not originally used | |
7316 after LOOP: the IVs of LOOP are now used by an epilog loop. | |
7317 2. LOOP is going to be vectorized; this means that it will iterate N/VF | |
7318 times, whereas the loop IVs should be bumped N times. | |
7319 | |
7320 Input: | |
7321 - LOOP - a loop that is going to be vectorized. The last few iterations | |
7322 of LOOP were peeled. | |
7323 - NITERS - the number of iterations that LOOP executes (before it is | |
7324 vectorized), i.e., the number of times the ivs should be bumped. | |
7325 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path | |
7326 coming out from LOOP on which there are uses of the LOOP ivs | |
7327 (this is the path from LOOP->exit to epilog_loop->preheader). | |
7328 | |
7329 The new definitions of the ivs are placed in LOOP->exit. | |
7330 The phi args associated with the edge UPDATE_E in the bb | |
7331 UPDATE_E->dest are updated accordingly. | |
7332 | |
7333 Assumption 1: Like the rest of the vectorizer, this function assumes | |
7334 a single loop exit that has a single predecessor. | |
7335 | |
7336 Assumption 2: The phi nodes in the LOOP header and in update_bb are | |
7337 organized in the same order. | |
7338 | |
7339 Assumption 3: The access function of the ivs is simple enough (see | |
7340 vect_can_advance_ivs_p). This assumption will be relaxed in the future. | |
7341 | |
7342 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path | |
7343 coming out of LOOP on which the ivs of LOOP are used (this is the path | |
7344 that leads to the epilog loop; other paths skip the epilog loop). This | |
7345 path starts with the edge UPDATE_E, and its destination (denoted update_bb) | |
7346 needs to have its phis updated. | |
7347 */ | |
7348 | |
7349 static void | |
7350 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters, | |
7351 edge update_e) | |
7352 { | |
7353 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
7354 basic_block exit_bb = single_exit (loop)->dest; | |
7355 gimple phi, phi1; | |
7356 gimple_stmt_iterator gsi, gsi1; | |
7357 basic_block update_bb = update_e->dest; | |
7358 | |
7359 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */ | |
7360 | |
7361 /* Make sure there exists a single-predecessor exit bb: */ | |
7362 gcc_assert (single_pred_p (exit_bb)); | |
7363 | |
7364 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb); | |
7365 !gsi_end_p (gsi) && !gsi_end_p (gsi1); | |
7366 gsi_next (&gsi), gsi_next (&gsi1)) | |
7367 { | |
7368 tree access_fn = NULL; | |
7369 tree evolution_part; | |
7370 tree init_expr; | |
7371 tree step_expr; | |
7372 tree var, ni, ni_name; | |
7373 gimple_stmt_iterator last_gsi; | |
7374 | |
7375 phi = gsi_stmt (gsi); | |
7376 phi1 = gsi_stmt (gsi1); | |
7377 if (vect_print_dump_info (REPORT_DETAILS)) | |
7378 { | |
7379 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: "); | |
7380 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); | |
7381 } | |
7382 | |
7383 /* Skip virtual phis. */ | |
7384 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi)))) | |
7385 { | |
7386 if (vect_print_dump_info (REPORT_DETAILS)) | |
7387 fprintf (vect_dump, "virtual phi. skip."); | |
7388 continue; | |
7389 } | |
7390 | |
7391 /* Skip reduction phis. */ | |
7392 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def) | |
7393 { | |
7394 if (vect_print_dump_info (REPORT_DETAILS)) | |
7395 fprintf (vect_dump, "reduc phi. skip."); | |
7396 continue; | |
7397 } | |
7398 | |
7399 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi)); | |
7400 gcc_assert (access_fn); | |
7401 STRIP_NOPS (access_fn); | |
7402 evolution_part = | |
7403 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num)); | |
7404 gcc_assert (evolution_part != NULL_TREE); | |
7405 | |
7406 /* FORNOW: We do not support IVs whose evolution function is a polynomial | |
7407 of degree >= 2 or exponential. */ | |
7408 gcc_assert (!tree_is_chrec (evolution_part)); | |
7409 | |
7410 step_expr = evolution_part; | |
7411 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, | |
7412 loop->num)); | |
7413 | |
7414 if (POINTER_TYPE_P (TREE_TYPE (init_expr))) | |
7415 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr), | |
7416 init_expr, | |
7417 fold_convert (sizetype, | |
7418 fold_build2 (MULT_EXPR, TREE_TYPE (niters), | |
7419 niters, step_expr))); | |
7420 else | |
7421 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr), | |
7422 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr), | |
7423 fold_convert (TREE_TYPE (init_expr), | |
7424 niters), | |
7425 step_expr), | |
7426 init_expr); | |
7427 | |
7428 | |
7429 | |
7430 var = create_tmp_var (TREE_TYPE (init_expr), "tmp"); | |
7431 add_referenced_var (var); | |
7432 | |
7433 last_gsi = gsi_last_bb (exit_bb); | |
7434 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var, | |
7435 true, GSI_SAME_STMT); | |
7436 | |
7437 /* Fix phi expressions in the successor bb. */ | |
7438 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name); | |
7439 } | |
7440 } | |
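
/* Illustrative sketch (not vectorizer code): the effect of the update
   built above for a simple integer IV.  After NITERS scalar iterations
   have been executed by the vectorized loop, an IV with access function
   {init, +, step} must enter the epilog loop with the value
   init + niters * step - exactly the 'ni' expression created above for
   the non-pointer case.  The function name is hypothetical.  */

static long
example_iv_value_after_peeling (long init, long step, long niters)
{
  /* Mirrors: ni = init_expr + niters * step_expr.  */
  return init + niters * step;
}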
7441 | |
7442 /* Return the more conservative threshold between the | |
7443 min_profitable_iters returned by the cost model and the | |
7444 user-specified threshold, if provided. */ | |
7445 | |
7446 static unsigned int | |
7447 conservative_cost_threshold (loop_vec_info loop_vinfo, | |
7448 int min_profitable_iters) | |
7449 { | |
7450 unsigned int th; | |
7451 int min_scalar_loop_bound; | |
7452 | |
7453 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) | |
7454 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1); | |
7455 | |
7456 /* Use the cost model only if it is more conservative than the | |
7457 user-specified threshold. */ | |
7458 th = (unsigned) min_scalar_loop_bound; | |
7459 if (min_profitable_iters | |
7460 && (!min_scalar_loop_bound | |
7461 || min_profitable_iters > min_scalar_loop_bound)) | |
7462 th = (unsigned) min_profitable_iters; | |
7463 | |
7464 if (th && vect_print_dump_info (REPORT_COST)) | |
7465 fprintf (vect_dump, "Vectorization may not be profitable."); | |
7466 | |
7467 return th; | |
7468 } | |
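
/* Worked example (illustrative numbers, not from the source): with
   PARAM_MIN_VECT_LOOP_BOUND == 2 and a vectorization factor of 4,
   min_scalar_loop_bound = 2 * 4 - 1 = 7.  If the cost model returns
   min_profitable_iters == 10, the larger (more conservative) value 10
   becomes the threshold; with min_profitable_iters == 5, the user
   bound 7 is kept instead.  */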
7469 | |
7470 /* Function vect_do_peeling_for_loop_bound | |
7471 | |
7472 Peel the last iterations of the loop represented by LOOP_VINFO. | |
7473 The peeled iterations form a new epilog loop. Given that the loop now | |
7474 iterates NITERS times, the new epilog loop iterates | |
7475 NITERS % VECTORIZATION_FACTOR times. | |
7476 | |
7477 The original loop will later be made to iterate | |
7478 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */ | |
7479 | |
7480 static void | |
7481 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio) | |
7482 { | |
7483 tree ni_name, ratio_mult_vf_name; | |
7484 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
7485 struct loop *new_loop; | |
7486 edge update_e; | |
7487 basic_block preheader; | |
7488 int loop_num; | |
7489 bool check_profitability = false; | |
7490 unsigned int th = 0; | |
7491 int min_profitable_iters; | |
7492 | |
7493 if (vect_print_dump_info (REPORT_DETAILS)) | |
7494 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ==="); | |
7495 | |
7496 initialize_original_copy_tables (); | |
7497 | |
7498 /* Generate the following variables on the preheader of the original loop: | |
7499 | |
7500 ni_name = number of iterations the original loop executes | |
7501 ratio = ni_name / vf | |
7502 ratio_mult_vf_name = ratio * vf */ | |
7503 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name, | |
7504 &ratio_mult_vf_name, ratio); | |
7505 | |
7506 loop_num = loop->num; | |
7507 | |
7508 /* Check profitability here if the cost model check was not already | |
7509 done during versioning or peeling for alignment. */ | |
7510 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)) | |
7511 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)) | |
7512 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)) | |
7513 { | |
7514 check_profitability = true; | |
7515 | |
7516 /* Get profitability threshold for vectorized loop. */ | |
7517 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo); | |
7518 | |
7519 th = conservative_cost_threshold (loop_vinfo, | |
7520 min_profitable_iters); | |
7521 } | |
7522 | |
7523 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop), | |
7524 ratio_mult_vf_name, ni_name, false, | |
7525 th, check_profitability); | |
7526 gcc_assert (new_loop); | |
7527 gcc_assert (loop_num == loop->num); | |
7528 #ifdef ENABLE_CHECKING | |
7529 slpeel_verify_cfg_after_peeling (loop, new_loop); | |
7530 #endif | |
7531 | |
7532 /* A guard that controls whether the new_loop is to be executed or skipped | |
7533 is placed in LOOP->exit. LOOP->exit therefore has two successors - one | |
7534 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other | |
7535 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that | |
7536 is on the path where the LOOP IVs are used and need to be updated. */ | |
7537 | |
7538 preheader = loop_preheader_edge (new_loop)->src; | |
7539 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest) | |
7540 update_e = EDGE_PRED (preheader, 0); | |
7541 else | |
7542 update_e = EDGE_PRED (preheader, 1); | |
7543 | |
7544 /* Update IVs of original loop as if they were advanced | |
7545 by ratio_mult_vf_name steps. */ | |
7546 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e); | |
7547 | |
7548 /* After peeling we have to reset scalar evolution analyzer. */ | |
7549 scev_reset (); | |
7550 | |
7551 free_original_copy_tables (); | |
7552 } | |
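
/* Illustrative sketch (not vectorizer code): how the peeling above
   splits NITERS between the vectorized loop and the scalar epilog,
   for a given vectorization factor VF.  The function name is
   hypothetical.  */

static void
example_split_niters (long niters, long vf, long *ratio,
                      long *ratio_mult_vf, long *epilog_niters)
{
  *ratio = niters / vf;                     /* vectorized iterations */
  *ratio_mult_vf = *ratio * vf;             /* scalar iters covered  */
  *epilog_niters = niters - *ratio_mult_vf; /* == niters % vf        */
}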
7553 | |
7554 | |
7555 /* Function vect_gen_niters_for_prolog_loop | |
7556 | |
7557 Set the number of iterations for the loop represented by LOOP_VINFO | |
7558 to the minimum between LOOP_NITERS (the original iteration count of the loop) | |
7559 and the misalignment of DR - the data reference recorded in | |
7560 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of | |
7561 this loop, the data reference DR will refer to an aligned location. | |
7562 | |
7563 The following computation is generated: | |
7564 | |
7565 If the misalignment of DR is known at compile time: | |
7566 addr_mis = DR_MISALIGNMENT (dr); | |
7567 Else, compute address misalignment in bytes: | |
7568 addr_mis = addr & (vectype_size - 1) | |
7569 | |
7570 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step) | |
7571 | |
7572 (elem_size = element type size; an element is the scalar element whose type | |
7573 is the inner type of the vectype) | |
7574 | |
7575 When the step of the data-ref in the loop is not 1 (as in interleaved data | |
7576 and SLP), the number of iterations of the prolog must be divided by the step | |
7577 (which is equal to the size of the interleaved group). | |
7578 | |
7579 The above formulas assume that VF == number of elements in the vector. This | |
7580 may not hold when there are multiple types in the loop. | |
7581 In this case, for some data-references in the loop the VF does not represent | |
7582 the number of elements that fit in the vector. Therefore, instead of VF we | |
7583 use TYPE_VECTOR_SUBPARTS. */ | |
7584 | |
7585 static tree | |
7586 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters) | |
7587 { | |
7588 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); | |
7589 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
7590 tree var; | |
7591 gimple_seq stmts; | |
7592 tree iters, iters_name; | |
7593 edge pe; | |
7594 basic_block new_bb; | |
7595 gimple dr_stmt = DR_STMT (dr); | |
7596 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt); | |
7597 tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
7598 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT; | |
7599 tree niters_type = TREE_TYPE (loop_niters); | |
7600 int step = 1; | |
7601 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); | |
7602 int nelements = TYPE_VECTOR_SUBPARTS (vectype); | |
7603 | |
7604 if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) | |
7605 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info))); | |
7606 | |
7607 pe = loop_preheader_edge (loop); | |
7608 | |
7609 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) | |
7610 { | |
7611 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); | |
7612 int elem_misalign = byte_misalign / element_size; | |
7613 | |
7614 if (vect_print_dump_info (REPORT_DETAILS)) | |
7615 fprintf (vect_dump, "known alignment = %d.", byte_misalign); | |
7616 | |
7617 iters = build_int_cst (niters_type, | |
7618 (((nelements - elem_misalign) & (nelements - 1)) / step)); | |
7619 } | |
7620 else | |
7621 { | |
7622 gimple_seq new_stmts = NULL; | |
7623 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, | |
7624 &new_stmts, NULL_TREE, loop); | |
7625 tree ptr_type = TREE_TYPE (start_addr); | |
7626 tree size = TYPE_SIZE (ptr_type); | |
7627 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1); | |
7628 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1); | |
7629 tree elem_size_log = | |
7630 build_int_cst (type, exact_log2 (vectype_align/nelements)); | |
7631 tree nelements_minus_1 = build_int_cst (type, nelements - 1); | |
7632 tree nelements_tree = build_int_cst (type, nelements); | |
7633 tree byte_misalign; | |
7634 tree elem_misalign; | |
7635 | |
7636 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts); | |
7637 gcc_assert (!new_bb); | |
7638 | |
7639 /* Create: byte_misalign = addr & (vectype_size - 1) */ | |
7640 byte_misalign = | |
7641 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1); | |
7642 | |
7643 /* Create: elem_misalign = byte_misalign / element_size */ | |
7644 elem_misalign = | |
7645 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log); | |
7646 | |
7647 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */ | |
7648 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign); | |
7649 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1); | |
7650 iters = fold_convert (niters_type, iters); | |
7651 } | |
7652 | |
7653 /* Create: prolog_loop_niters = min (iters, loop_niters) */ | |
7654 /* If the loop bound is known at compile time, we have already verified | |
7655 that it is greater than vf; since the misalignment ('iters') is at most | |
7656 vf, there's no need to generate the MIN_EXPR in this case. */ | |
7657 if (TREE_CODE (loop_niters) != INTEGER_CST) | |
7658 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters); | |
7659 | |
7660 if (vect_print_dump_info (REPORT_DETAILS)) | |
7661 { | |
7662 fprintf (vect_dump, "niters for prolog loop: "); | |
7663 print_generic_expr (vect_dump, iters, TDF_SLIM); | |
7664 } | |
7665 | |
7666 var = create_tmp_var (niters_type, "prolog_loop_niters"); | |
7667 add_referenced_var (var); | |
7668 stmts = NULL; | |
7669 iters_name = force_gimple_operand (iters, &stmts, false, var); | |
7670 | |
7671 /* Insert stmt on loop preheader edge. */ | |
7672 if (stmts) | |
7673 { | |
7674 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | |
7675 gcc_assert (!new_bb); | |
7676 } | |
7677 | |
7678 return iters_name; | |
7679 } | |
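
/* Illustrative sketch (not vectorizer code) of the prolog iteration
   count computed above for the runtime-misalignment case.  NELEMENTS
   must be a power of two, which is what makes the BIT_AND_EXPR trick
   equivalent to a modulo operation.  The function name is
   hypothetical.  */

static long
example_prolog_niters (unsigned long start_addr, long vectype_align,
                       long element_size, long nelements, long step)
{
  long byte_misalign = start_addr & (vectype_align - 1);
  long elem_misalign = byte_misalign / element_size;

  /* Scalar iterations to execute before DR becomes aligned.  */
  return ((nelements - elem_misalign) & (nelements - 1)) / step;
}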
7680 | |
7681 | |
7682 /* Function vect_update_init_of_dr | |
7683 | |
7684 NITERS iterations were peeled from LOOP. DR represents a data reference | |
7685 in LOOP. This function updates the information recorded in DR to | |
7686 account for the fact that the first NITERS iterations had already been | |
7687 executed. Specifically, it updates the OFFSET field of DR. */ | |
7688 | |
7689 static void | |
7690 vect_update_init_of_dr (struct data_reference *dr, tree niters) | |
7691 { | |
7692 tree offset = DR_OFFSET (dr); | |
7693 | |
7694 niters = fold_build2 (MULT_EXPR, sizetype, | |
7695 fold_convert (sizetype, niters), | |
7696 fold_convert (sizetype, DR_STEP (dr))); | |
7697 offset = fold_build2 (PLUS_EXPR, sizetype, offset, niters); | |
7698 DR_OFFSET (dr) = offset; | |
7699 } | |
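
/* Illustrative sketch (not vectorizer code): a data reference that
   advances by STEP bytes per iteration starts NITERS * STEP bytes
   further along after peeling, which is what the DR_OFFSET update
   above records.  The function name is hypothetical.  */

static long
example_updated_offset (long offset, long niters, long step)
{
  return offset + niters * step;
}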
7700 | |
7701 | |
7702 /* Function vect_update_inits_of_drs | |
7703 | |
7704 NITERS iterations were peeled from the loop represented by LOOP_VINFO. | |
7705 This function updates the information recorded for the data references in | |
7706 the loop to account for the fact that the first NITERS iterations had | |
7707 already been executed. Specifically, it updates the initial_condition of | |
7708 the access_function of all the data_references in the loop. */ | |
7709 | |
7710 static void | |
7711 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters) | |
7712 { | |
7713 unsigned int i; | |
7714 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); | |
7715 struct data_reference *dr; | |
7716 | |
7717 if (vect_print_dump_info (REPORT_DETAILS)) | |
7718 fprintf (vect_dump, "=== vect_update_inits_of_drs ==="); | |
7719 | |
7720 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++) | |
7721 vect_update_init_of_dr (dr, niters); | |
7722 } | |
7723 | |
7724 | |
7725 /* Function vect_do_peeling_for_alignment | |
7726 | |
7727 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO. | |
7728 'niters' is set to the misalignment of one of the data references in the | |
7729 loop, thereby forcing it to refer to an aligned location at the beginning | |
7730 of the execution of this loop. The data reference for which we are | |
7731 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */ | |
7732 | |
7733 static void | |
7734 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo) | |
7735 { | |
7736 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
7737 tree niters_of_prolog_loop, ni_name; | |
7738 tree n_iters; | |
7739 struct loop *new_loop; | |
7740 bool check_profitability = false; | |
7741 unsigned int th = 0; | |
7742 int min_profitable_iters; | |
7743 | |
7744 if (vect_print_dump_info (REPORT_DETAILS)) | |
7745 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ==="); | |
7746 | |
7747 initialize_original_copy_tables (); | |
7748 | |
7749 ni_name = vect_build_loop_niters (loop_vinfo); | |
7750 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name); | |
7751 | |
7752 | |
7753 /* If the cost model check was not done during versioning. */ | |
7754 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)) | |
7755 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))) | |
7756 { | |
7757 check_profitability = true; | |
7758 | |
7759 /* Get profitability threshold for vectorized loop. */ | |
7760 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo); | |
7761 | |
7762 th = conservative_cost_threshold (loop_vinfo, | |
7763 min_profitable_iters); | |
7764 } | |
7765 | |
7766 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */ | |
7767 new_loop = | |
7768 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop), | |
7769 niters_of_prolog_loop, ni_name, true, | |
7770 th, check_profitability); | |
7771 | |
7772 gcc_assert (new_loop); | |
7773 #ifdef ENABLE_CHECKING | |
7774 slpeel_verify_cfg_after_peeling (new_loop, loop); | |
7775 #endif | |
7776 | |
7777 /* Update number of times loop executes. */ | |
7778 n_iters = LOOP_VINFO_NITERS (loop_vinfo); | |
7779 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR, | |
7780 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop); | |
7781 | |
7782 /* Update the init conditions of the access functions of all data refs. */ | |
7783 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop); | |
7784 | |
7785 /* After peeling we have to reset scalar evolution analyzer. */ | |
7786 scev_reset (); | |
7787 | |
7788 free_original_copy_tables (); | |
7789 } | |
7790 | |
7791 | |
7792 /* Function vect_create_cond_for_align_checks. | |
7793 | |
7794 Create a conditional expression that represents the alignment checks for | |
7795 all data references (array element references) whose alignment must be | |
7796 checked at runtime. | |
7797 | |
7798 Input: | |
7799 COND_EXPR - input conditional expression. New conditions will be chained | |
7800 with logical AND operation. | |
7801 LOOP_VINFO - two fields of the loop information are used. | |
7802 LOOP_VINFO_PTR_MASK is the mask used to check the alignment. | |
7803 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked. | |
7804 | |
7805 Output: | |
7806 COND_EXPR_STMT_LIST - statements needed to construct the conditional | |
7807 expression. | |
7808 The returned value is the conditional expression to be used in the if | |
7809 statement that controls which version of the loop gets executed at runtime. | |
7810 | |
7811 The algorithm makes two assumptions: | |
7812 1) The number of bytes "n" in a vector is a power of 2. | |
7813 2) An address "a" is aligned if a%n is zero, and this | |
7814 test can be done as a&(n-1) == 0. For example, for 16 | |
7815 byte vectors the test is a&0xf == 0. */ | |
7816 | |
7817 static void | |
7818 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, | |
7819 tree *cond_expr, | |
7820 gimple_seq *cond_expr_stmt_list) | |
7821 { | |
7822 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
7823 VEC(gimple,heap) *may_misalign_stmts | |
7824 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); | |
7825 gimple ref_stmt; | |
7826 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo); | |
7827 tree mask_cst; | |
7828 unsigned int i; | |
7829 tree psize; | |
7830 tree int_ptrsize_type; | |
7831 char tmp_name[20]; | |
7832 tree or_tmp_name = NULL_TREE; | |
7833 tree and_tmp, and_tmp_name; | |
7834 gimple and_stmt; | |
7835 tree ptrsize_zero; | |
7836 tree part_cond_expr; | |
7837 | |
7838 /* Check that mask is one less than a power of 2, i.e., mask is | |
7839 all zeros followed by all ones. */ | |
7840 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0)); | |
7841 | |
7842 /* CHECKME: what is the best integer or unsigned type to use to hold a | |
7843 cast from a pointer value? */ | |
7844 psize = TYPE_SIZE (ptr_type_node); | |
7845 int_ptrsize_type | |
7846 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0); | |
7847 | |
7848 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address | |
7849 of the first vector of the i'th data reference. */ | |
7850 | |
7851 for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++) | |
7852 { | |
7853 gimple_seq new_stmt_list = NULL; | |
7854 tree addr_base; | |
7855 tree addr_tmp, addr_tmp_name; | |
7856 tree or_tmp, new_or_tmp_name; | |
7857 gimple addr_stmt, or_stmt; | |
7858 | |
7859 /* create: addr_tmp = (int)(address_of_first_vector) */ | |
7860 addr_base = | |
7861 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list, | |
7862 NULL_TREE, loop); | |
7863 if (new_stmt_list != NULL) | |
7864 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list); | |
7865 | |
7866 sprintf (tmp_name, "%s%d", "addr2int", i); | |
7867 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name); | |
7868 add_referenced_var (addr_tmp); | |
7869 addr_tmp_name = make_ssa_name (addr_tmp, NULL); | |
7870 addr_stmt = gimple_build_assign_with_ops (NOP_EXPR, addr_tmp_name, | |
7871 addr_base, NULL_TREE); | |
7872 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt; | |
7873 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt); | |
7874 | |
7875 /* The addresses are ORed together. */ | |
7876 | |
7877 if (or_tmp_name != NULL_TREE) | |
7878 { | |
7879 /* create: or_tmp = or_tmp | addr_tmp */ | |
7880 sprintf (tmp_name, "%s%d", "orptrs", i); | |
7881 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name); | |
7882 add_referenced_var (or_tmp); | |
7883 new_or_tmp_name = make_ssa_name (or_tmp, NULL); | |
7884 or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR, | |
7885 new_or_tmp_name, | |
7886 or_tmp_name, addr_tmp_name); | |
7887 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt; | |
7888 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt); | |
7889 or_tmp_name = new_or_tmp_name; | |
7890 } | |
7891 else | |
7892 or_tmp_name = addr_tmp_name; | |
7893 | |
7894 } /* end for i */ | |
7895 | |
7896 mask_cst = build_int_cst (int_ptrsize_type, mask); | |
7897 | |
7898 /* create: and_tmp = or_tmp & mask */ | |
7899 and_tmp = create_tmp_var (int_ptrsize_type, "andmask"); | |
7900 add_referenced_var (and_tmp); | |
7901 and_tmp_name = make_ssa_name (and_tmp, NULL); | |
7902 | |
7903 and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name, | |
7904 or_tmp_name, mask_cst); | |
7905 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt; | |
7906 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt); | |
7907 | |
7908 /* Make and_tmp the left operand of the conditional test against zero. | |
7909 If and_tmp has a nonzero bit, then some address is unaligned. */ | |
7910 ptrsize_zero = build_int_cst (int_ptrsize_type, 0); | |
7911 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node, | |
7912 and_tmp_name, ptrsize_zero); | |
7913 if (*cond_expr) | |
7914 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, | |
7915 *cond_expr, part_cond_expr); | |
7916 else | |
7917 *cond_expr = part_cond_expr; | |
7918 } | |
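
/* Illustrative sketch (not vectorizer code) of the runtime test built
   above: OR all candidate addresses together and test the low bits
   once, rather than testing each address separately.  MASK must be one
   less than a power of two (e.g. 0xf for 16-byte vectors).  The
   function name is hypothetical.  */

static int
example_all_aligned (const unsigned long *addrs, int n, unsigned long mask)
{
  unsigned long or_tmp = 0;
  int i;

  for (i = 0; i < n; i++)
    or_tmp |= addrs[i];

  /* A nonzero bit under MASK means some address is misaligned.  */
  return (or_tmp & mask) == 0;
}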
7919 | |
7920 /* Function vect_vfa_segment_size. | |
7921 | |
7922 Create an expression that computes the size of the segment | |
7923 that will be accessed for a data reference. The function takes into | |
7924 account that realignment loads may access one more vector. | |
7925 | |
7926 Input: | |
7927 DR: The data reference. | |
7928 VECT_FACTOR: vectorization factor. | |
7929 | |
7930 Return an expression whose value is the size of segment which will be | |
7931 accessed by DR. */ | |
7932 | |
7933 static tree | |
7934 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor) | |
7935 { | |
7936 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node, | |
7937 DR_STEP (dr), vect_factor); | |
7938 | |
7939 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized) | |
7940 { | |
7941 tree vector_size = TYPE_SIZE_UNIT | |
7942 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)))); | |
7943 | |
7944 segment_length = fold_build2 (PLUS_EXPR, integer_type_node, | |
7945 segment_length, vector_size); | |
7946 } | |
7947 return fold_convert (sizetype, segment_length); | |
7948 } | |
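
/* Illustrative sketch (not vectorizer code): the segment a data
   reference touches during one iteration of the vectorized loop is
   STEP * VF bytes, plus one extra vector when the optimized
   realignment scheme may load one vector past the last element.
   The function name is hypothetical.  */

static long
example_segment_size (long step, long vf, long vector_size,
                      int realign_optimized)
{
  long segment_length = step * vf;

  if (realign_optimized)
    segment_length += vector_size;
  return segment_length;
}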
7949 | |
7950 /* Function vect_create_cond_for_alias_checks. | |
7951 | |
7952 Create a conditional expression that represents the run-time checks for | |
7953 overlapping of address ranges represented by a list of data reference | |
7954 relations passed as input. | |
7955 | |
7956 Input: | |
7957 COND_EXPR - input conditional expression. New conditions will be chained | |
7958 with logical AND operation. | |
7959 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs | |
7960 to be checked. | |
7961 | |
7962 Output: | |
7963 COND_EXPR - conditional expression. | |
7964 COND_EXPR_STMT_LIST - statements needed to construct the conditional | |
7965 expression. | |
7966 | |
7967 | |
7968 The returned value is the conditional expression to be used in the if | |
7969 statement that controls which version of the loop gets executed at runtime. | |
7970 */ | |
7971 | |
7972 static void | |
7973 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, | |
7974 tree * cond_expr, | |
7975 gimple_seq * cond_expr_stmt_list) | |
7976 { | |
7977 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
7978 VEC (ddr_p, heap) * may_alias_ddrs = | |
7979 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); | |
7980 tree vect_factor = | |
7981 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); | |
7982 | |
7983 ddr_p ddr; | |
7984 unsigned int i; | |
7985 tree part_cond_expr; | |
7986 | |
7987 /* Create expression | |
7988 (((store_ptr_0 + store_segment_length_0) < load_ptr_0) | |
7989 || ((load_ptr_0 + load_segment_length_0) < store_ptr_0)) | |
7990 && | |
7991 ... | |
7992 && | |
7993 (((store_ptr_n + store_segment_length_n) < load_ptr_n) | |
7994 || ((load_ptr_n + load_segment_length_n) < store_ptr_n)) */ | |
7995 | |
7996 if (VEC_empty (ddr_p, may_alias_ddrs)) | |
7997 return; | |
7998 | |
7999 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++) | |
8000 { | |
8001 struct data_reference *dr_a, *dr_b; | |
8002 gimple dr_group_first_a, dr_group_first_b; | |
8003 tree addr_base_a, addr_base_b; | |
8004 tree segment_length_a, segment_length_b; | |
8005 gimple stmt_a, stmt_b; | |
8006 | |
8007 dr_a = DDR_A (ddr); | |
8008 stmt_a = DR_STMT (DDR_A (ddr)); | |
8009 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a)); | |
8010 if (dr_group_first_a) | |
8011 { | |
8012 stmt_a = dr_group_first_a; | |
8013 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a)); | |
8014 } | |
8015 | |
8016 dr_b = DDR_B (ddr); | |
8017 stmt_b = DR_STMT (DDR_B (ddr)); | |
8018 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b)); | |
8019 if (dr_group_first_b) | |
8020 { | |
8021 stmt_b = dr_group_first_b; | |
8022 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b)); | |
8023 } | |
8024 | |
8025 addr_base_a = | |
8026 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list, | |
8027 NULL_TREE, loop); | |
8028 addr_base_b = | |
8029 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list, | |
8030 NULL_TREE, loop); | |
8031 | |
8032 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor); | |
8033 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor); | |
8034 | |
8035 if (vect_print_dump_info (REPORT_DR_DETAILS)) | |
8036 { | |
8037 fprintf (vect_dump, | |
8038 "create runtime check for data references "); | |
8039 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM); | |
8040 fprintf (vect_dump, " and "); | |
8041 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM); | |
8042 } | |
8043 | |
8044 | |
8045 part_cond_expr = | |
8046 fold_build2 (TRUTH_OR_EXPR, boolean_type_node, | |
8047 fold_build2 (LT_EXPR, boolean_type_node, | |
8048 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a), | |
8049 addr_base_a, | |
8050 segment_length_a), | |
8051 addr_base_b), | |
8052 fold_build2 (LT_EXPR, boolean_type_node, | |
8053 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b), | |
8054 addr_base_b, | |
8055 segment_length_b), | |
8056 addr_base_a)); | |
8057 | |
8058 if (*cond_expr) | |
8059 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, | |
8060 *cond_expr, part_cond_expr); | |
8061 else | |
8062 *cond_expr = part_cond_expr; | |
8063 } | |
8064 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)) | |
8065 fprintf (vect_dump, "created %u versioning for alias checks.\n", | |
8066 VEC_length (ddr_p, may_alias_ddrs)); | |
8067 | |
8068 } | |
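
/* Illustrative sketch (not vectorizer code) of a single alias check
   generated above: two segments are treated as independent when one
   ends before the other begins (strict '<', matching LT_EXPR).  The
   function name is hypothetical.  */

static int
example_segments_independent (unsigned long addr_a, unsigned long len_a,
                              unsigned long addr_b, unsigned long len_b)
{
  return (addr_a + len_a < addr_b) || (addr_b + len_b < addr_a);
}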
8069 | |
8070 /* Function vect_loop_versioning. | |
8071 | |
8072 If the loop has data references that may or may not be aligned and/or | |
8073 has data reference relations whose independence was not proven, then | |
8074 two versions of the loop need to be generated, one which is vectorized | |
8075 and one which isn't. A test is then generated to control which of the | |
8076 loops is executed. The test checks for the alignment of all of the | |
8077 data references that may or may not be aligned. An additional | |
8078 sequence of runtime tests is generated for each pair of DDRs whose | |
8079 independence was not proven. The vectorized version of the loop is | |
8080 executed only if both the alias and the alignment tests pass. | |
8081 | |
8082 The test generated to check which version of the loop is executed | |
8083 is also modified to check for profitability, as indicated by the | |
8084 cost model. */ | |
8085 | |
8086 static void | |
8087 vect_loop_versioning (loop_vec_info loop_vinfo) | |
8088 { | |
8089 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
8090 struct loop *nloop; | |
8091 tree cond_expr = NULL_TREE; | |
8092 gimple_seq cond_expr_stmt_list = NULL; | |
8093 basic_block condition_bb; | |
8094 gimple_stmt_iterator gsi, cond_exp_gsi; | |
8095 basic_block merge_bb; | |
8096 basic_block new_exit_bb; | |
8097 edge new_exit_e, e; | |
8098 gimple orig_phi, new_phi; | |
8099 tree arg; | |
8100 unsigned prob = 4 * REG_BR_PROB_BASE / 5; | |
8101 gimple_seq gimplify_stmt_list = NULL; | |
8102 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); | |
8103 int min_profitable_iters = 0; | |
8104 unsigned int th; | |
8105 | |
8106 /* Get profitability threshold for vectorized loop. */ | |
8107 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo); | |
8108 | |
8109 th = conservative_cost_threshold (loop_vinfo, | |
8110 min_profitable_iters); | |
8111 | |
8112 cond_expr = | |
8113 fold_build2 (GT_EXPR, boolean_type_node, scalar_loop_iters, | |
8114 build_int_cst (TREE_TYPE (scalar_loop_iters), th)); | |
8115 | |
8116 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list, | |
8117 false, NULL_TREE); | |
8118 | |
8119 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))) | |
8120 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr, | |
8121 &cond_expr_stmt_list); | |
8122 | |
8123 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))) | |
8124 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr, | |
8125 &cond_expr_stmt_list); | |
8126 | |
8127 cond_expr = | |
8128 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node); | |
8129 cond_expr = | |
8130 force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE); | |
8131 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); | |
8132 | |
8133 initialize_original_copy_tables (); | |
8134 nloop = loop_version (loop, cond_expr, &condition_bb, | |
8135 prob, prob, REG_BR_PROB_BASE - prob, true); | |
8136 free_original_copy_tables (); | |
8137 | |
8138 /* Loop versioning violates an assumption we try to maintain during | |
8139 vectorization - that the loop exit block has a single predecessor. | |
8140 After versioning, the exit block of both loop versions is the same | |
8141 basic block (i.e. it has two predecessors). Just in order to simplify | |
8142 following transformations in the vectorizer, we fix this situation | |
8143 here by adding a new (empty) block on the exit-edge of the loop, | |
8144 with the proper loop-exit phis to maintain loop-closed-form. */ | |
8145 | |
8146 merge_bb = single_exit (loop)->dest; | |
8147 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2); | |
8148 new_exit_bb = split_edge (single_exit (loop)); | |
8149 new_exit_e = single_exit (loop); | |
8150 e = EDGE_SUCC (new_exit_bb, 0); | |
8151 | |
8152 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) | |
8153 { | |
8154 orig_phi = gsi_stmt (gsi); | |
8155 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)), | |
8156 new_exit_bb); | |
8157 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); | |
8158 add_phi_arg (new_phi, arg, new_exit_e); | |
8159 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi)); | |
8160 } | |
8161 | |
8162 /* End loop-exit-fixes after versioning. */ | |
8163 | |
8164 update_ssa (TODO_update_ssa); | |
8165 if (cond_expr_stmt_list) | |
8166 { | |
8167 cond_exp_gsi = gsi_last_bb (condition_bb); | |
8168 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, GSI_SAME_STMT); | |
8169 } | |
8170 } | |
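
/* Illustrative sketch (not vectorizer code): the shape of the runtime
   guard produced by the versioning above.  COST_OK stands for the
   'scalar_loop_iters > th' test; ALIGN_OK and ALIAS_OK stand for the
   optional conditions chained in with TRUTH_AND_EXPR.  The function
   name is hypothetical.  */

static void
example_versioned_loops (int cost_ok, int align_ok, int alias_ok,
                         void (*vector_loop) (void),
                         void (*scalar_loop) (void))
{
  if (cost_ok && align_ok && alias_ok)
    vector_loop ();  /* the version that gets vectorized */
  else
    scalar_loop ();  /* the unmodified scalar copy */
}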
8171 | |
8172 /* Remove a group of stores (for SLP or interleaving), free their | |
8173 stmt_vec_info. */ | |
8174 | |
8175 static void | |
8176 vect_remove_stores (gimple first_stmt) | |
8177 { | |
8178 gimple next = first_stmt; | |
8179 gimple tmp; | |
8180 gimple_stmt_iterator next_si; | |
8181 | |
8182 while (next) | |
8183 { | |
8184 /* Free the attached stmt_vec_info and remove the stmt. */ | |
8185 next_si = gsi_for_stmt (next); | |
8186 gsi_remove (&next_si, true); | |
8187 tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next)); | |
8188 free_stmt_vec_info (next); | |
8189 next = tmp; | |
8190 } | |
8191 } | |
8192 | |
8193 | |
8194 /* Vectorize SLP instance tree in postorder. */ | |
8195 | |
8196 static bool | |
8197 vect_schedule_slp_instance (slp_tree node, slp_instance instance, | |
8198 unsigned int vectorization_factor) | |
8199 { | |
8200 gimple stmt; | |
8201 bool strided_store, is_store; | |
8202 gimple_stmt_iterator si; | |
8203 stmt_vec_info stmt_info; | |
8204 unsigned int vec_stmts_size, nunits, group_size; | |
8205 tree vectype; | |
8206 int i; | |
8207 slp_tree loads_node; | |
8208 | |
8209 if (!node) | |
8210 return false; | |
8211 | |
8212 vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance, | |
8213 vectorization_factor); | |
8214 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance, | |
8215 vectorization_factor); | |
8216 | |
8217 stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0); | |
8218 stmt_info = vinfo_for_stmt (stmt); | |
8219 | |
8220 /* VECTYPE is the type of the destination. */ | |
8221 vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt))); | |
8222 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype); | |
8223 group_size = SLP_INSTANCE_GROUP_SIZE (instance); | |
8224 | |
8225 /* For each SLP instance calculate the number of vector stmts to be created | |
8226 for the scalar stmts in each node of the SLP tree. The number of vector | |
8227 elements processed in one vector iteration is the number of scalar elements | |
8228 in one scalar iteration (GROUP_SIZE) multiplied by VF, divided by the | |
8229 vector size (NUNITS). */ | |
8230 vec_stmts_size = (vectorization_factor * group_size) / nunits; | |
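  /* For example (illustrative numbers): with GROUP_SIZE == 2, VF == 4
     and NUNITS == 4, each node needs (4 * 2) / 4 == 2 vector stmts.  */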
8231 | |
8232 /* In case of load permutation we have to allocate vectorized statements for | |
8233 all the nodes that participate in that permutation. */ | |
8234 if (SLP_INSTANCE_LOAD_PERMUTATION (instance)) | |
8235 { | |
8236 for (i = 0; | |
8237 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node); | |
8238 i++) | |
8239 { | |
8240 if (!SLP_TREE_VEC_STMTS (loads_node)) | |
8241 { | |
8242 SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap, | |
8243 vec_stmts_size); | |
8244 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size; | |
8245 } | |
8246 } | |
8247 } | |
8248 | |
8249 if (!SLP_TREE_VEC_STMTS (node)) | |
8250 { | |
8251 SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size); | |
8252 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size; | |
8253 } | |
8254 | |
8255 if (vect_print_dump_info (REPORT_DETAILS)) | |
8256 { | |
8257 fprintf (vect_dump, "------>vectorizing SLP node starting from: "); | |
8258 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); | |
8259 } | |
8260 | |
8261 /* Loads should be inserted before the first load. */ | |
8262 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance) | |
8263 && STMT_VINFO_STRIDED_ACCESS (stmt_info) | |
8264 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))) | |
8265 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance)); | |
8266 else | |
8267 si = gsi_for_stmt (stmt); | |
8268 | |
8269 is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance); | |
8270 if (is_store) | |
8271 { | |
8272 if (DR_GROUP_FIRST_DR (stmt_info)) | |
8273 /* If IS_STORE is TRUE, the vectorization of the | |
8274 interleaving chain was completed - free all the stores in | |
8275 the chain. */ | |
8276 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info)); | |
8277 else | |
8278 /* FORNOW: SLP originates only from strided stores. */ | |
8279 gcc_unreachable (); | |
8280 | |
8281 return true; | |
8282 } | |
8283 | |
8284 /* FORNOW: SLP originates only from strided stores. */ | |
8285 return false; | |
8286 } | |
8287 | |
8288 | |
8289 static bool | |
8290 vect_schedule_slp (loop_vec_info loop_vinfo) | |
8291 { | |
8292 VEC (slp_instance, heap) *slp_instances = | |
8293 LOOP_VINFO_SLP_INSTANCES (loop_vinfo); | |
8294 slp_instance instance; | |
8295 unsigned int i; | |
8296 bool is_store = false; | |
8297 | |
8298 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) | |
8299 { | |
8300 /* Schedule the tree of INSTANCE. */ | |
8301 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance), | |
8302 instance, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); | |
8303 | |
8304 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS) | |
8305 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) | |
8306 fprintf (vect_dump, "vectorizing stmts using SLP."); | |
8307 } | |
8308 | |
8309 return is_store; | |
8310 } | |
8311 | |
8312 /* Function vect_transform_loop. | |
8313 | |
8314 The analysis phase has determined that the loop is vectorizable. | |
8315 Vectorize the loop - create vectorized stmts to replace the scalar | |
8316 stmts in the loop, and update the loop exit condition. */ | |
8317 | |
8318 void | |
8319 vect_transform_loop (loop_vec_info loop_vinfo) | |
8320 { | |
8321 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
8322 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | |
8323 int nbbs = loop->num_nodes; | |
8324 gimple_stmt_iterator si; | |
8325 int i; | |
8326 tree ratio = NULL; | |
8327 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
8328 bool strided_store; | |
8329 bool slp_scheduled = false; | |
8330 unsigned int nunits; | |
8331 | |
8332 if (vect_print_dump_info (REPORT_DETAILS)) | |
8333 fprintf (vect_dump, "=== vect_transform_loop ==="); | |
8334 | |
8335 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)) | |
8336 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))) | |
8337 vect_loop_versioning (loop_vinfo); | |
8338 | |
8339 /* CHECKME: we wouldn't need this if we called update_ssa once | |
8340 for all loops. */ | |
8341 bitmap_zero (vect_memsyms_to_rename); | |
8342 | |
8343 /* Peel the loop if there are data refs with unknown alignment. | |
8344 Only one such data ref (recorded in LOOP_VINFO_UNALIGNED_DR) is handled. */ | |
8345 | |
8346 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)) | |
8347 vect_do_peeling_for_alignment (loop_vinfo); | |
8348 | |
8349 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a | |
8350 compile-time constant), or it is a constant that is not divisible by the | |
8351 vectorization factor, then an epilog loop needs to be created. | |
8352 We therefore duplicate the loop: the original loop will be vectorized, | |
8353 and will compute the first (n/VF) iterations. The second copy of the loop | |
8354 will remain scalar and will compute the remaining (n%VF) iterations. | |
8355 (VF is the vectorization factor). */ | |
8356 | |
8357 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
8358 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
8359 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)) | |
8360 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio); | |
8361 else | |
8362 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), | |
8363 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor); | |
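  /* For example (illustrative numbers): with n == 103 and VF == 4 the
     vectorized loop runs ratio == 25 times, covering 100 scalar
     iterations, and the epilog loop executes the remaining 3.  */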
8364 | |
8365 /* 1) Make sure the loop header has exactly two entries | |
8366 2) Make sure we have a preheader basic block. */ | |
8367 | |
8368 gcc_assert (EDGE_COUNT (loop->header->preds) == 2); | |
8369 | |
8370 split_edge (loop_preheader_edge (loop)); | |
8371 | |
8372 /* FORNOW: the vectorizer supports only loops whose body consists | |
8373 of one basic block (header + empty latch). When the vectorizer | |
8374 supports more involved loop forms, the order in which the BBs are | |
8375 traversed will need to be reconsidered. */ | |
8376 | |
8377 for (i = 0; i < nbbs; i++) | |
8378 { | |
8379 basic_block bb = bbs[i]; | |
8380 stmt_vec_info stmt_info; | |
8381 gimple phi; | |
8382 | |
8383 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) | |
8384 { | |
8385 phi = gsi_stmt (si); | |
8386 if (vect_print_dump_info (REPORT_DETAILS)) | |
8387 { | |
8388 fprintf (vect_dump, "------>vectorizing phi: "); | |
8389 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); | |
8390 } | |
8391 stmt_info = vinfo_for_stmt (phi); | |
8392 if (!stmt_info) | |
8393 continue; | |
8394 | |
8395 if (!STMT_VINFO_RELEVANT_P (stmt_info) | |
8396 && !STMT_VINFO_LIVE_P (stmt_info)) | |
8397 continue; | |
8398 | |
8399 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)) | |
8400 != (unsigned HOST_WIDE_INT) vectorization_factor) | |
8401 && vect_print_dump_info (REPORT_DETAILS)) | |
8402 fprintf (vect_dump, "multiple-types."); | |
8403 | |
8404 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def) | |
8405 { | |
8406 if (vect_print_dump_info (REPORT_DETAILS)) | |
8407 fprintf (vect_dump, "transform phi."); | |
8408 vect_transform_stmt (phi, NULL, NULL, NULL, NULL); | |
8409 } | |
8410 } | |
8411 | |
8412 for (si = gsi_start_bb (bb); !gsi_end_p (si);) | |
8413 { | |
8414 gimple stmt = gsi_stmt (si); | |
8415 bool is_store; | |
8416 | |
8417 if (vect_print_dump_info (REPORT_DETAILS)) | |
8418 { | |
8419 fprintf (vect_dump, "------>vectorizing statement: "); | |
8420 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); | |
8421 } | |
8422 | |
8423 stmt_info = vinfo_for_stmt (stmt); | |
8424 | |
8425 /* vector stmts created in the outer-loop during vectorization of | |
8426 stmts in an inner-loop may not have a stmt_info, and do not | |
8427 need to be vectorized. */ | |
8428 if (!stmt_info) | |
8429 { | |
8430 gsi_next (&si); | |
8431 continue; | |
8432 } | |
8433 | |
8434 if (!STMT_VINFO_RELEVANT_P (stmt_info) | |
8435 && !STMT_VINFO_LIVE_P (stmt_info)) | |
8436 { | |
8437 gsi_next (&si); | |
8438 continue; | |
8439 } | |
8440 | |
8441 gcc_assert (STMT_VINFO_VECTYPE (stmt_info)); | |
8442 nunits = | |
8443 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); | |
8444 if (!STMT_SLP_TYPE (stmt_info) | |
8445 && nunits != (unsigned int) vectorization_factor | |
8446 && vect_print_dump_info (REPORT_DETAILS)) | |
8447 /* For SLP, VF is set according to the unrolling factor, not to the | |
8448 vector size; hence for SLP this print is not valid. */ | |
8449 fprintf (vect_dump, "multiple-types."); | |
8450 | |
8451 /* SLP. Schedule all the SLP instances when the first SLP stmt is | |
8452 reached. */ | |
8453 if (STMT_SLP_TYPE (stmt_info)) | |
8454 { | |
8455 if (!slp_scheduled) | |
8456 { | |
8457 slp_scheduled = true; | |
8458 | |
8459 if (vect_print_dump_info (REPORT_DETAILS)) | |
8460 fprintf (vect_dump, "=== scheduling SLP instances ==="); | |
8461 | |
8462 vect_schedule_slp (loop_vinfo); | |
8463 } | |
8464 | |
8465 /* Hybrid SLP stmts must be vectorized in addition to SLP. */ | |
8466 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info)) | |
8467 { | |
8468 gsi_next (&si); | |
8469 continue; | |
8470 } | |
8471 } | |
8472 | |
8473 /* -------- vectorize statement ------------ */ | |
8474 if (vect_print_dump_info (REPORT_DETAILS)) | |
8475 fprintf (vect_dump, "transform statement."); | |
8476 | |
8477 strided_store = false; | |
8478 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL); | |
8479 if (is_store) | |
8480 { | |
8481 if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) | |
8482 { | |
8483 /* Interleaving. If IS_STORE is TRUE, the vectorization of the | |
8484 interleaving chain was completed - free all the stores in | |
8485 the chain. */ | |
8486 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info)); | |
8487 gsi_remove (&si, true); | |
8488 continue; | |
8489 } | |
8490 else | |
8491 { | |
8492 /* Free the attached stmt_vec_info and remove the stmt. */ | |
8493 free_stmt_vec_info (stmt); | |
8494 gsi_remove (&si, true); | |
8495 continue; | |
8496 } | |
8497 } | |
8498 gsi_next (&si); | |
8499 } /* stmts in BB */ | |
8500 } /* BBs in loop */ | |
8501 | |
8502 slpeel_make_loop_iterate_ntimes (loop, ratio); | |
8503 | |
8504 mark_set_for_renaming (vect_memsyms_to_rename); | |
8505 | |
8506 /* The memory tags and pointers in vectorized statements need to | |
8507 have their SSA forms updated. FIXME, why can't this be delayed | |
8508 until all the loops have been transformed? */ | |
8509 update_ssa (TODO_update_ssa); | |
8510 | |
8511 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)) | |
8512 fprintf (vect_dump, "LOOP VECTORIZED."); | |
8513 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS)) | |
8514 fprintf (vect_dump, "OUTER LOOP VECTORIZED."); | |
8515 } |