comparison gcc/tree-ssa-math-opts.c @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
parents 04ced10e8804
children 1830386684a0
111:04ced10e8804 131:84e7813d76e9
1 /* Global, SSA-based optimizations using mathematical identities. 1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2017 Free Software Foundation, Inc. 2 Copyright (C) 2005-2018 Free Software Foundation, Inc.
3 3
4 This file is part of GCC. 4 This file is part of GCC.
5 5
6 GCC is free software; you can redistribute it and/or modify it 6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the 7 under the terms of the GNU General Public License as published by the
113 #include "internal-fn.h" 113 #include "internal-fn.h"
114 #include "case-cfn-macros.h" 114 #include "case-cfn-macros.h"
115 #include "optabs-libfuncs.h" 115 #include "optabs-libfuncs.h"
116 #include "tree-eh.h" 116 #include "tree-eh.h"
117 #include "targhooks.h" 117 #include "targhooks.h"
118 #include "domwalk.h"
118 119
119 /* This structure represents one basic block that either computes a 120 /* This structure represents one basic block that either computes a
120 division, or is a common dominator for basic blocks that compute a 121 division, or is a common dominator for basic blocks that compute a
121 division. */ 122 division. */
122 struct occurrence { 123 struct occurrence {
125 126
126 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal 127 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
127 inserted in BB. */ 128 inserted in BB. */
128 tree recip_def; 129 tree recip_def;
129 130
131 /* If non-NULL, the SSA_NAME holding the definition for a squared
132 reciprocal inserted in BB. */
133 tree square_recip_def;
134
130 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that 135 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
131 was inserted in BB. */ 136 was inserted in BB. */
132 gimple *recip_def_stmt; 137 gimple *recip_def_stmt;
133 138
134 /* Pointer to a list of "struct occurrence"s for blocks dominated 139 /* Pointer to a list of "struct occurrence"s for blocks dominated
162 static struct 167 static struct
163 { 168 {
164 /* Number of cexpi calls inserted. */ 169 /* Number of cexpi calls inserted. */
165 int inserted; 170 int inserted;
166 } sincos_stats; 171 } sincos_stats;
167
168 static struct
169 {
170 /* Number of hand-written 16-bit nop / bswaps found. */
171 int found_16bit;
172
173 /* Number of hand-written 32-bit nop / bswaps found. */
174 int found_32bit;
175
176 /* Number of hand-written 64-bit nop / bswaps found. */
177 int found_64bit;
178 } nop_stats, bswap_stats;
179 172
180 static struct 173 static struct
181 { 174 {
182 /* Number of widening multiplication ops inserted. */ 175 /* Number of widening multiplication ops inserted. */
183 int widen_mults_inserted; 176 int widen_mults_inserted;
280 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */ 273 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
281 new_occ->next = *p_head; 274 new_occ->next = *p_head;
282 *p_head = new_occ; 275 *p_head = new_occ;
283 } 276 }
284 277
285 /* Register that we found a division in BB. */ 278 /* Register that we found a division in BB.
279 IMPORTANCE is a measure of how much weighting to give
280 that division. Use IMPORTANCE = 2 to register a single
281 division. If the division is going to be found multiple
282 times, use 1 (as it is with squares). */
286 283
287 static inline void 284 static inline void
288 register_division_in (basic_block bb) 285 register_division_in (basic_block bb, int importance)
289 { 286 {
290 struct occurrence *occ; 287 struct occurrence *occ;
291 288
292 occ = (struct occurrence *) bb->aux; 289 occ = (struct occurrence *) bb->aux;
293 if (!occ) 290 if (!occ)
295 occ = occ_new (bb, NULL); 292 occ = occ_new (bb, NULL);
296 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head); 293 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
297 } 294 }
298 295
299 occ->bb_has_division = true; 296 occ->bb_has_division = true;
300 occ->num_divisions++; 297 occ->num_divisions += importance;
301 } 298 }
302 299
303 300
304 /* Compute the number of divisions that postdominate each block in OCC and 301 /* Compute the number of divisions that postdominate each block in OCC and
305 its children. */ 302 its children. */
338 confused later by replacing all immediate uses x in such 335 confused later by replacing all immediate uses x in such
339 a stmt. */ 336 a stmt. */
340 && gimple_assign_rhs1 (use_stmt) != def; 337 && gimple_assign_rhs1 (use_stmt) != def;
341 } 338 }
342 339
340 /* Return TRUE if USE_STMT is a multiplication of DEF by A. */
341 static inline bool
342 is_mult_by (gimple *use_stmt, tree def, tree a)
343 {
344 if (gimple_code (use_stmt) == GIMPLE_ASSIGN
345 && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
346 {
347 tree op0 = gimple_assign_rhs1 (use_stmt);
348 tree op1 = gimple_assign_rhs2 (use_stmt);
349
350 return (op0 == def && op1 == a)
351 || (op0 == a && op1 == def);
352 }
353 return false;
354 }
355
356 /* Return whether USE_STMT is DEF * DEF. */
357 static inline bool
358 is_square_of (gimple *use_stmt, tree def)
359 {
360 return is_mult_by (use_stmt, def, def);
361 }
362
363 /* Return whether USE_STMT is a floating-point division by
364 DEF * DEF. */
365 static inline bool
366 is_division_by_square (gimple *use_stmt, tree def)
367 {
368 if (gimple_code (use_stmt) == GIMPLE_ASSIGN
369 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
370 && gimple_assign_rhs1 (use_stmt) != gimple_assign_rhs2 (use_stmt))
371 {
372 tree denominator = gimple_assign_rhs2 (use_stmt);
373 if (TREE_CODE (denominator) == SSA_NAME)
374 {
375 return is_square_of (SSA_NAME_DEF_STMT (denominator), def);
376 }
377 }
378 return false;
379 }
380
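For illustration, a minimal C fragment (a hand-written sketch, not taken from
this change) containing the patterns the predicates above recognize:

    /* Sketch: is_square_of matches the statement computing s, and
       is_division_by_square matches the division whose denominator is s.  */
    double
    example (double a, double x)
    {
      double s = x * x;   /* is_square_of (stmt, x) */
      return a / s;       /* is_division_by_square (stmt, x) */
    }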
343 /* Walk the subset of the dominator tree rooted at OCC, setting the 381 /* Walk the subset of the dominator tree rooted at OCC, setting the
344 RECIP_DEF field to a definition of 1.0 / DEF that can be used in 382 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
345 the given basic block. The field may be left NULL, of course, 383 the given basic block. The field may be left NULL, of course,
346 if it is not possible or profitable to do the optimization. 384 if it is not possible or profitable to do the optimization.
347 385
348 DEF_BSI is an iterator pointing at the statement defining DEF. 386 DEF_BSI is an iterator pointing at the statement defining DEF.
349 If RECIP_DEF is set, a dominator already has a computation that can 387 If RECIP_DEF is set, a dominator already has a computation that can
350 be used. */ 388 be used.
389
390 If SHOULD_INSERT_SQUARE_RECIP is set, then this also inserts
391 the square of the reciprocal immediately after the definition
392 of the reciprocal. */
351 393
352 static void 394 static void
353 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ, 395 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
354 tree def, tree recip_def, int threshold) 396 tree def, tree recip_def, tree square_recip_def,
397 int should_insert_square_recip, int threshold)
355 { 398 {
356 tree type; 399 tree type;
357 gassign *new_stmt; 400 gassign *new_stmt, *new_square_stmt;
358 gimple_stmt_iterator gsi; 401 gimple_stmt_iterator gsi;
359 struct occurrence *occ_child; 402 struct occurrence *occ_child;
360 403
361 if (!recip_def 404 if (!recip_def
362 && (occ->bb_has_division || !flag_trapping_math) 405 && (occ->bb_has_division || !flag_trapping_math)
363 && occ->num_divisions >= threshold) 406 /* Divide by two as all divisions are counted twice in
407 the costing loop. */
408 && occ->num_divisions / 2 >= threshold)
364 { 409 {
365 /* Make a variable with the replacement and substitute it. */ 410 /* Make a variable with the replacement and substitute it. */
366 type = TREE_TYPE (def); 411 type = TREE_TYPE (def);
367 recip_def = create_tmp_reg (type, "reciptmp"); 412 recip_def = create_tmp_reg (type, "reciptmp");
368 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR, 413 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
369 build_one_cst (type), def); 414 build_one_cst (type), def);
370 415
416 if (should_insert_square_recip)
417 {
418 square_recip_def = create_tmp_reg (type, "powmult_reciptmp");
419 new_square_stmt = gimple_build_assign (square_recip_def, MULT_EXPR,
420 recip_def, recip_def);
421 }
422
371 if (occ->bb_has_division) 423 if (occ->bb_has_division)
372 { 424 {
373 /* Case 1: insert before an existing division. */ 425 /* Case 1: insert before an existing division. */
374 gsi = gsi_after_labels (occ->bb); 426 gsi = gsi_after_labels (occ->bb);
375 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def)) 427 while (!gsi_end_p (gsi)
428 && (!is_division_by (gsi_stmt (gsi), def))
429 && (!is_division_by_square (gsi_stmt (gsi), def)))
376 gsi_next (&gsi); 430 gsi_next (&gsi);
377 431
378 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT); 432 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
379 } 433 if (should_insert_square_recip)
434 gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
435 }
380 else if (def_gsi && occ->bb == def_gsi->bb) 436 else if (def_gsi && occ->bb == def_gsi->bb)
381 { 437 {
382 /* Case 2: insert right after the definition. Note that this will 438 /* Case 2: insert right after the definition. Note that this will
383 never happen if the definition statement can throw, because in 439 never happen if the definition statement can throw, because in
384 that case the sole successor of the statement's basic block will 440 that case the sole successor of the statement's basic block will
385 dominate all the uses as well. */ 441 dominate all the uses as well. */
386 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT); 442 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
387 } 443 if (should_insert_square_recip)
444 gsi_insert_after (def_gsi, new_square_stmt, GSI_NEW_STMT);
445 }
388 else 446 else
389 { 447 {
390 /* Case 3: insert in a basic block not containing defs/uses. */ 448 /* Case 3: insert in a basic block not containing defs/uses. */
391 gsi = gsi_after_labels (occ->bb); 449 gsi = gsi_after_labels (occ->bb);
392 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT); 450 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
393 } 451 if (should_insert_square_recip)
452 gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
453 }
394 454
395 reciprocal_stats.rdivs_inserted++; 455 reciprocal_stats.rdivs_inserted++;
396 456
397 occ->recip_def_stmt = new_stmt; 457 occ->recip_def_stmt = new_stmt;
398 } 458 }
399 459
400 occ->recip_def = recip_def; 460 occ->recip_def = recip_def;
461 occ->square_recip_def = square_recip_def;
401 for (occ_child = occ->children; occ_child; occ_child = occ_child->next) 462 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
402 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold); 463 insert_reciprocals (def_gsi, occ_child, def, recip_def,
464 square_recip_def, should_insert_square_recip,
465 threshold);
466 }
467
468 /* Replace occurrences of expr / (x * x) with expr * ((1 / x) * (1 / x)).
469 USE_P is the use of (x * x) within such a division. */
470 static inline void
471 replace_reciprocal_squares (use_operand_p use_p)
472 {
473 gimple *use_stmt = USE_STMT (use_p);
474 basic_block bb = gimple_bb (use_stmt);
475 struct occurrence *occ = (struct occurrence *) bb->aux;
476
477 if (optimize_bb_for_speed_p (bb) && occ->square_recip_def
478 && occ->recip_def)
479 {
480 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
481 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
482 gimple_assign_set_rhs2 (use_stmt, occ->square_recip_def);
483 SET_USE (use_p, occ->square_recip_def);
484 fold_stmt_inplace (&gsi);
485 update_stmt (use_stmt);
486 }
403 } 487 }
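A sketch of the combined effect (hand-written; it assumes the target's
division threshold is met so that insert_reciprocals fires, and note that at
least one direct division by x is required):

    double t0 = a / x;
    double t1 = b / (x * x);
    double t2 = c / (x * x);

is rewritten to the equivalent of:

    double r = 1.0 / x;        /* reciptmp, from insert_reciprocals */
    double r2 = r * r;         /* powmult_reciptmp */
    double t0 = a * r;         /* replace_reciprocal */
    double t1 = b * r2;        /* replace_reciprocal_squares */
    double t2 = c * r2;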
404 488
405 489
406 /* Replace the division at USE_P with a multiplication by the reciprocal, if 490 /* Replace the division at USE_P with a multiplication by the reciprocal, if
407 possible. */ 491 possible. */
448 532
449 return child; 533 return child;
450 } 534 }
451 } 535 }
452 536
537 /* Transform sequences like
538 t = sqrt (a);
539 x = 1.0 / t;
540 r1 = x * x;
541 r2 = a * x;
542 into:
543 t = sqrt (a);
544 r1 = 1.0 / a;
545 r2 = t;
546 x = r1 * r2;
547 depending on the uses of x, r1, r2. This removes one multiplication and
548 allows the sqrt and division operations to execute in parallel.
549 DEF_GSI is the gsi of the initial division by sqrt that defines
550 DEF (x in the example above). */
551
552 static void
553 optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
554 {
555 gimple *use_stmt;
556 imm_use_iterator use_iter;
557 gimple *stmt = gsi_stmt (*def_gsi);
558 tree x = def;
559 tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
560 tree div_rhs1 = gimple_assign_rhs1 (stmt);
561
562 if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
563 || TREE_CODE (div_rhs1) != REAL_CST
564 || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
565 return;
566
567 gcall *sqrt_stmt
568 = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
569
570 if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
571 return;
572
573 switch (gimple_call_combined_fn (sqrt_stmt))
574 {
575 CASE_CFN_SQRT:
576 CASE_CFN_SQRT_FN:
577 break;
578
579 default:
580 return;
581 }
582 tree a = gimple_call_arg (sqrt_stmt, 0);
583
584 /* We have 'a' and 'x'. Now analyze the uses of 'x'. */
585
586 /* Statements that use x in x * x. */
587 auto_vec<gimple *> sqr_stmts;
588 /* Statements that use x in a * x. */
589 auto_vec<gimple *> mult_stmts;
590 bool has_other_use = false;
591 bool mult_on_main_path = false;
592
593 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
594 {
595 if (is_gimple_debug (use_stmt))
596 continue;
597 if (is_square_of (use_stmt, x))
598 {
599 sqr_stmts.safe_push (use_stmt);
600 if (gimple_bb (use_stmt) == gimple_bb (stmt))
601 mult_on_main_path = true;
602 }
603 else if (is_mult_by (use_stmt, x, a))
604 {
605 mult_stmts.safe_push (use_stmt);
606 if (gimple_bb (use_stmt) == gimple_bb (stmt))
607 mult_on_main_path = true;
608 }
609 else
610 has_other_use = true;
611 }
612
613 /* In the x * x and a * x cases we just rewire stmt operands or
614 remove multiplications. In the has_other_use case we introduce
615 a multiplication, so make sure we don't add one on a path where
616 there was none. */
617 if (has_other_use && !mult_on_main_path)
618 return;
619
620 if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
621 return;
622
623 /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
624 to be able to compose it from the sqr and mult cases. */
625 if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
626 return;
627
628 if (dump_file)
629 {
630 fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
631 print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
632 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
633 fprintf (dump_file, "\n");
634 }
635
636 bool delete_div = !has_other_use;
637 tree sqr_ssa_name = NULL_TREE;
638 if (!sqr_stmts.is_empty ())
639 {
640 /* r1 = x * x. Transform the original
641 x = 1.0 / t
642 into
643 tmp1 = 1.0 / a
644 r1 = tmp1. */
645
646 sqr_ssa_name
647 = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
648
649 if (dump_file)
650 {
651 fprintf (dump_file, "Replacing original division\n");
652 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
653 fprintf (dump_file, "with new division\n");
654 }
655 gimple_assign_set_lhs (stmt, sqr_ssa_name);
656 gimple_assign_set_rhs2 (stmt, a);
657 fold_stmt_inplace (def_gsi);
658 update_stmt (stmt);
659
660 if (dump_file)
661 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
662
663 delete_div = false;
664 gimple *sqr_stmt;
665 unsigned int i;
666 FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
667 {
668 gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
669 gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
670 update_stmt (sqr_stmt);
671 }
672 }
673 if (!mult_stmts.is_empty ())
674 {
675 /* r2 = a * x. Transform this into:
676 r2 = t (The original sqrt (a)). */
677 unsigned int i;
678 gimple *mult_stmt = NULL;
679 FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
680 {
681 gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
682
683 if (dump_file)
684 {
685 fprintf (dump_file, "Replacing squaring multiplication\n");
686 print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
687 fprintf (dump_file, "with assignment\n");
688 }
689 gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
690 fold_stmt_inplace (&gsi2);
691 update_stmt (mult_stmt);
692 if (dump_file)
693 print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
694 }
695 }
696
697 if (has_other_use)
698 {
699 /* Using tmp1 (= 1.0 / a) from above and the original
700 sqrt result t, the original x is now:
701 x = t * tmp1. */
702 gcc_assert (orig_sqrt_ssa_name);
703 gcc_assert (sqr_ssa_name);
704
705 gimple *new_stmt
706 = gimple_build_assign (x, MULT_EXPR,
707 orig_sqrt_ssa_name, sqr_ssa_name);
708 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
709 update_stmt (stmt);
710 }
711 else if (delete_div)
712 {
713 /* Remove the original division. */
714 gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
715 gsi_remove (&gsi2, true);
716 release_defs (stmt);
717 }
718 }
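At the source level, the transform implemented above amounts to the following
hand-written sketch (assuming -funsafe-math-optimizations, under which
optimize_recip_sqrt is called):

    double t = sqrt (a);
    double x = 1.0 / t;
    double r1 = x * x;      /* becomes r1 = 1.0 / a */
    double r2 = a * x;      /* becomes r2 = t */

If x has further uses, it is recomputed afterwards as x = t * r1, trading a
multiplication for a division and a sqrt that can execute in parallel.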
453 719
454 /* Look for floating-point divisions among DEF's uses, and try to 720 /* Look for floating-point divisions among DEF's uses, and try to
455 replace them by multiplications with the reciprocal. Add 721 replace them by multiplications with the reciprocal. Add
456 as many statements computing the reciprocal as needed. 722 as many statements computing the reciprocal as needed.
457 723
458 DEF must be a GIMPLE register of a floating-point type. */ 724 DEF must be a GIMPLE register of a floating-point type. */
459 725
460 static void 726 static void
461 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def) 727 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
462 { 728 {
463 use_operand_p use_p; 729 use_operand_p use_p, square_use_p;
464 imm_use_iterator use_iter; 730 imm_use_iterator use_iter, square_use_iter;
731 tree square_def;
465 struct occurrence *occ; 732 struct occurrence *occ;
466 int count = 0, threshold; 733 int count = 0;
467 734 int threshold;
468 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def)); 735 int square_recip_count = 0;
736 int sqrt_recip_count = 0;
737
738 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
739 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
740
741 /* If DEF is a square (x * x), count the number of divisions by x.
742 If there are more divisions by x than by DEF itself, prefer to optimize
743 the reciprocal of x instead of DEF. This improves cases like:
744 def = x * x
745 t0 = a / def
746 t1 = b / def
747 t2 = c / x
748 Reciprocal optimization of x results in 1 division rather than 2 or 3. */
749 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
750
751 if (is_gimple_assign (def_stmt)
752 && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
753 && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
754 && gimple_assign_rhs1 (def_stmt) == gimple_assign_rhs2 (def_stmt))
755 {
756 tree op0 = gimple_assign_rhs1 (def_stmt);
757
758 FOR_EACH_IMM_USE_FAST (use_p, use_iter, op0)
759 {
760 gimple *use_stmt = USE_STMT (use_p);
761 if (is_division_by (use_stmt, op0))
762 sqrt_recip_count++;
763 }
764 }
469 765
470 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def) 766 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
471 { 767 {
472 gimple *use_stmt = USE_STMT (use_p); 768 gimple *use_stmt = USE_STMT (use_p);
473 if (is_division_by (use_stmt, def)) 769 if (is_division_by (use_stmt, def))
474 { 770 {
475 register_division_in (gimple_bb (use_stmt)); 771 register_division_in (gimple_bb (use_stmt), 2);
476 count++; 772 count++;
477 } 773 }
478 } 774
775 if (is_square_of (use_stmt, def))
776 {
777 square_def = gimple_assign_lhs (use_stmt);
778 FOR_EACH_IMM_USE_FAST (square_use_p, square_use_iter, square_def)
779 {
780 gimple *square_use_stmt = USE_STMT (square_use_p);
781 if (is_division_by (square_use_stmt, square_def))
782 {
783 /* This is executed twice for each division by a square. */
784 register_division_in (gimple_bb (square_use_stmt), 1);
785 square_recip_count++;
786 }
787 }
788 }
789 }
790
791 /* Square reciprocals were counted twice above. */
792 square_recip_count /= 2;
793
794 /* If it is more profitable to optimize 1 / x, don't optimize 1 / (x * x). */
795 if (sqrt_recip_count > square_recip_count)
796 return;
479 797
480 /* Do the expensive part only if we can hope to optimize something. */ 798 /* Do the expensive part only if we can hope to optimize something. */
481 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def))); 799 if (count + square_recip_count >= threshold && count >= 1)
482 if (count >= threshold)
483 { 800 {
484 gimple *use_stmt; 801 gimple *use_stmt;
485 for (occ = occ_head; occ; occ = occ->next) 802 for (occ = occ_head; occ; occ = occ->next)
486 { 803 {
487 compute_merit (occ); 804 compute_merit (occ);
488 insert_reciprocals (def_gsi, occ, def, NULL, threshold); 805 insert_reciprocals (def_gsi, occ, def, NULL, NULL,
806 square_recip_count, threshold);
489 } 807 }
490 808
491 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def) 809 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
492 { 810 {
493 if (is_division_by (use_stmt, def)) 811 if (is_division_by (use_stmt, def))
494 { 812 {
495 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter) 813 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
496 replace_reciprocal (use_p); 814 replace_reciprocal (use_p);
497 } 815 }
816 else if (square_recip_count > 0 && is_square_of (use_stmt, def))
817 {
818 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
819 {
820 /* Find all uses of the square that are divisions and
821 replace them by multiplications with the inverse. */
822 imm_use_iterator square_iterator;
823 gimple *powmult_use_stmt = USE_STMT (use_p);
824 tree powmult_def_name = gimple_assign_lhs (powmult_use_stmt);
825
826 FOR_EACH_IMM_USE_STMT (powmult_use_stmt,
827 square_iterator, powmult_def_name)
828 FOR_EACH_IMM_USE_ON_STMT (square_use_p, square_iterator)
829 {
830 gimple *powmult_use_stmt = USE_STMT (square_use_p);
831 if (is_division_by (powmult_use_stmt, powmult_def_name))
832 replace_reciprocal_squares (square_use_p);
833 }
834 }
835 }
498 } 836 }
499 } 837 }
500 838
501 for (occ = occ_head; occ; ) 839 for (occ = occ_head; occ; )
502 occ = free_bb (occ); 840 occ = free_bb (occ);
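A minimal sketch of the basic reciprocal CSE (hand-written; -freciprocal-math
must be enabled and the number of divisions must reach the target's
min_divisions_for_recip_mul threshold):

    double
    f (double a, double b, double c, double x)
    {
      return a / x + b / x + c / x;
    }

is rewritten to the equivalent of:

    double reciptmp = 1.0 / x;
    return a * reciptmp + b * reciptmp + c * reciptmp;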
513 internal_fn ifn; 851 internal_fn ifn;
514 852
515 switch (gimple_call_combined_fn (call)) 853 switch (gimple_call_combined_fn (call))
516 { 854 {
517 CASE_CFN_SQRT: 855 CASE_CFN_SQRT:
856 CASE_CFN_SQRT_FN:
518 ifn = IFN_RSQRT; 857 ifn = IFN_RSQRT;
519 break; 858 break;
520 859
521 default: 860 default:
522 return IFN_LAST; 861 return IFN_LAST;
536 const pass_data pass_data_cse_reciprocals = 875 const pass_data pass_data_cse_reciprocals =
537 { 876 {
538 GIMPLE_PASS, /* type */ 877 GIMPLE_PASS, /* type */
539 "recip", /* name */ 878 "recip", /* name */
540 OPTGROUP_NONE, /* optinfo_flags */ 879 OPTGROUP_NONE, /* optinfo_flags */
541 TV_NONE, /* tv_id */ 880 TV_TREE_RECIP, /* tv_id */
542 PROP_ssa, /* properties_required */ 881 PROP_ssa, /* properties_required */
543 0, /* properties_provided */ 882 0, /* properties_provided */
544 0, /* properties_destroyed */ 883 0, /* properties_destroyed */
545 0, /* todo_flags_start */ 884 0, /* todo_flags_start */
546 TODO_update_ssa, /* todo_flags_finish */ 885 TODO_update_ssa, /* todo_flags_finish */
605 944
606 if (gimple_has_lhs (stmt) 945 if (gimple_has_lhs (stmt)
607 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL 946 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
608 && FLOAT_TYPE_P (TREE_TYPE (def)) 947 && FLOAT_TYPE_P (TREE_TYPE (def))
609 && TREE_CODE (def) == SSA_NAME) 948 && TREE_CODE (def) == SSA_NAME)
610 execute_cse_reciprocals_1 (&gsi, def); 949 {
950 execute_cse_reciprocals_1 (&gsi, def);
951 stmt = gsi_stmt (gsi);
952 if (flag_unsafe_math_optimizations
953 && is_gimple_assign (stmt)
954 && !stmt_can_throw_internal (cfun, stmt)
955 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
956 optimize_recip_sqrt (&gsi, def);
957 }
611 } 958 }
612 959
613 if (optimize_bb_for_size_p (bb)) 960 if (optimize_bb_for_size_p (bb))
614 continue; 961 continue;
615 962
642 internal_fn ifn = internal_fn_reciprocal (call); 989 internal_fn ifn = internal_fn_reciprocal (call);
643 if (ifn == IFN_LAST) 990 if (ifn == IFN_LAST)
644 { 991 {
645 fndecl = gimple_call_fndecl (call); 992 fndecl = gimple_call_fndecl (call);
646 if (!fndecl 993 if (!fndecl
647 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_MD) 994 || !fndecl_built_in_p (fndecl, BUILT_IN_MD))
648 continue; 995 continue;
649 fndecl = targetm.builtin_reciprocal (fndecl); 996 fndecl = targetm.builtin_reciprocal (fndecl);
650 if (!fndecl) 997 if (!fndecl)
651 continue; 998 continue;
652 } 999 }
1750 const pass_data pass_data_cse_sincos = 2097 const pass_data pass_data_cse_sincos =
1751 { 2098 {
1752 GIMPLE_PASS, /* type */ 2099 GIMPLE_PASS, /* type */
1753 "sincos", /* name */ 2100 "sincos", /* name */
1754 OPTGROUP_NONE, /* optinfo_flags */ 2101 OPTGROUP_NONE, /* optinfo_flags */
1755 TV_NONE, /* tv_id */ 2102 TV_TREE_SINCOS, /* tv_id */
1756 PROP_ssa, /* properties_required */ 2103 PROP_ssa, /* properties_required */
1757 PROP_gimple_opt_math, /* properties_provided */ 2104 PROP_gimple_opt_math, /* properties_provided */
1758 0, /* properties_destroyed */ 2105 0, /* properties_destroyed */
1759 0, /* todo_flags_start */ 2106 0, /* todo_flags_start */
1760 TODO_update_ssa, /* todo_flags_finish */ 2107 TODO_update_ssa, /* todo_flags_finish */
1931 make_pass_cse_sincos (gcc::context *ctxt) 2278 make_pass_cse_sincos (gcc::context *ctxt)
1932 { 2279 {
1933 return new pass_cse_sincos (ctxt); 2280 return new pass_cse_sincos (ctxt);
1934 } 2281 }
1935 2282
1936 /* A symbolic number structure is used to detect byte permutation and selection
1937 patterns of a source. To achieve that, its field N contains an artificial
1938 number consisting of BITS_PER_MARKER sized markers tracking where each
1939 byte comes from in the source:
1940
1941 0 - target byte has the value 0
1942 FF - target byte has an unknown value (e.g. due to sign extension)
1943 1..size - marker value is one plus the byte index in the source (1 for lsb).
1944
1945 To detect permutations on memory sources (arrays and structures), a symbolic
1946 number also records:
1947 - a base address BASE_ADDR and an OFFSET giving the address of the source;
1948 - a range which gives the difference between the highest and lowest accessed
1949 memory location to make such a symbolic number;
1950 - the address SRC of the source element of lowest address as a convenience
1951 to easily get BASE_ADDR + offset + lowest bytepos;
1952 - the number of expressions N_OPS bitwise ORed together to represent the
1953 approximate cost of the computation.
1954
1955 Note 1: the range is different from size as size reflects the size of the
1956 type of the current expression. For instance, for an array char a[],
1957 (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
1958 (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
1959 time a range of 1.
1960
1961 Note 2: for non-memory sources, range holds the same value as size.
1962
1963 Note 3: SRC points to the SSA_NAME in case of non-memory source. */
1964
1965 struct symbolic_number {
1966 uint64_t n;
1967 tree type;
1968 tree base_addr;
1969 tree offset;
1970 HOST_WIDE_INT bytepos;
1971 tree src;
1972 tree alias_set;
1973 tree vuse;
1974 unsigned HOST_WIDE_INT range;
1975 int n_ops;
1976 };
1977
1978 #define BITS_PER_MARKER 8
1979 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1980 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1981 #define HEAD_MARKER(n, size) \
1982 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1983
1984 /* The number which the find_bswap_or_nop_1 result should match in
1985 order to have a nop. The number is masked according to the size of
1986 the symbolic number before using it. */
1987 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1988 (uint64_t)0x08070605 << 32 | 0x04030201)
1989
1990 /* The number which the find_bswap_or_nop_1 result should match in
1991 order to have a byte swap. The number is masked according to the
1992 size of the symbolic number before using it. */
1993 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1994 (uint64_t)0x01020304 << 32 | 0x05060708)
1995
1996 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1997 number N. Return false if the requested operation is not permitted
1998 on a symbolic number. */
1999
2000 static inline bool
2001 do_shift_rotate (enum tree_code code,
2002 struct symbolic_number *n,
2003 int count)
2004 {
2005 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2006 unsigned head_marker;
2007
2008 if (count % BITS_PER_UNIT != 0)
2009 return false;
2010 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
2011
2012 /* Zero out the extra bits of N in order to avoid them being shifted
2013 into the significant bits. */
2014 if (size < 64 / BITS_PER_MARKER)
2015 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2016
2017 switch (code)
2018 {
2019 case LSHIFT_EXPR:
2020 n->n <<= count;
2021 break;
2022 case RSHIFT_EXPR:
2023 head_marker = HEAD_MARKER (n->n, size);
2024 n->n >>= count;
2025 /* Arithmetic shift of signed type: result is dependent on the value. */
2026 if (!TYPE_UNSIGNED (n->type) && head_marker)
2027 for (i = 0; i < count / BITS_PER_MARKER; i++)
2028 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2029 << ((size - 1 - i) * BITS_PER_MARKER);
2030 break;
2031 case LROTATE_EXPR:
2032 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
2033 break;
2034 case RROTATE_EXPR:
2035 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
2036 break;
2037 default:
2038 return false;
2039 }
2040 /* Zero unused bits for size. */
2041 if (size < 64 / BITS_PER_MARKER)
2042 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2043 return true;
2044 }
2045
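A hand-worked example of the marker arithmetic above: for a 4-byte unsigned
value with n->n == 0x04030201, an LSHIFT_EXPR by 8 bits shifts one whole
marker, giving 0x03020100 after the final masking; the lsb marker is now 0,
i.e. known to be zero, just as in the value of x << 8 itself.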
2046 /* Perform sanity checking for the symbolic number N and the gimple
2047 statement STMT. */
2048
2049 static inline bool
2050 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
2051 {
2052 tree lhs_type;
2053
2054 lhs_type = gimple_expr_type (stmt);
2055
2056 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
2057 return false;
2058
2059 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
2060 return false;
2061
2062 return true;
2063 }
2064
2065 /* Initialize the symbolic number N for the bswap pass from the base element
2066 SRC manipulated by the bitwise OR expression. */
2067
2068 static bool
2069 init_symbolic_number (struct symbolic_number *n, tree src)
2070 {
2071 int size;
2072
2073 if (! INTEGRAL_TYPE_P (TREE_TYPE (src)))
2074 return false;
2075
2076 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2077 n->src = src;
2078
2079 /* Set up the symbolic number N by setting each byte to a value between 1 and
2080 the byte size of SRC. The highest order byte is set to that size and the
2081 lowest order byte to 1. */
2082 n->type = TREE_TYPE (src);
2083 size = TYPE_PRECISION (n->type);
2084 if (size % BITS_PER_UNIT != 0)
2085 return false;
2086 size /= BITS_PER_UNIT;
2087 if (size > 64 / BITS_PER_MARKER)
2088 return false;
2089 n->range = size;
2090 n->n = CMPNOP;
2091 n->n_ops = 1;
2092
2093 if (size < 64 / BITS_PER_MARKER)
2094 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2095
2096 return true;
2097 }
2098
2099 /* Check whether STMT might be a byte swap or a nop from a memory source and
2100 return the answer. If so, REF is that memory source and the base of the
2101 memory area accessed and the offset of the access from that base are recorded in N. */
2102
2103 bool
2104 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2105 {
2106 /* Leaf node is an array or component ref. Memorize its base and
2107 offset from base to compare to other such leaf nodes. */
2108 HOST_WIDE_INT bitsize, bitpos;
2109 machine_mode mode;
2110 int unsignedp, reversep, volatilep;
2111 tree offset, base_addr;
2112
2113 /* Not prepared to handle PDP endian. */
2114 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2115 return false;
2116
2117 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2118 return false;
2119
2120 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2121 &unsignedp, &reversep, &volatilep);
2122
2123 if (TREE_CODE (base_addr) == MEM_REF)
2124 {
2125 offset_int bit_offset = 0;
2126 tree off = TREE_OPERAND (base_addr, 1);
2127
2128 if (!integer_zerop (off))
2129 {
2130 offset_int boff, coff = mem_ref_offset (base_addr);
2131 boff = coff << LOG2_BITS_PER_UNIT;
2132 bit_offset += boff;
2133 }
2134
2135 base_addr = TREE_OPERAND (base_addr, 0);
2136
2137 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2138 if (wi::neg_p (bit_offset))
2139 {
2140 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2141 offset_int tem = wi::bit_and_not (bit_offset, mask);
2142 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2143 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2144 bit_offset -= tem;
2145 tem >>= LOG2_BITS_PER_UNIT;
2146 if (offset)
2147 offset = size_binop (PLUS_EXPR, offset,
2148 wide_int_to_tree (sizetype, tem));
2149 else
2150 offset = wide_int_to_tree (sizetype, tem);
2151 }
2152
2153 bitpos += bit_offset.to_shwi ();
2154 }
2155
2156 if (bitpos % BITS_PER_UNIT)
2157 return false;
2158 if (bitsize % BITS_PER_UNIT)
2159 return false;
2160 if (reversep)
2161 return false;
2162
2163 if (!init_symbolic_number (n, ref))
2164 return false;
2165 n->base_addr = base_addr;
2166 n->offset = offset;
2167 n->bytepos = bitpos / BITS_PER_UNIT;
2168 n->alias_set = reference_alias_ptr_type (ref);
2169 n->vuse = gimple_vuse (stmt);
2170 return true;
2171 }
2172
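For illustration, the classic source pattern whose byte loads are recorded
here (a hand-written sketch; on a little-endian target the whole expression
is later recognized as a plain 32-bit load):

    #include <stdint.h>

    uint32_t
    load32_le (const unsigned char *p)
    {
      return (uint32_t) p[0]
	     | ((uint32_t) p[1] << 8)
	     | ((uint32_t) p[2] << 16)
	     | ((uint32_t) p[3] << 24);
    }

Each p[i] is a memory leaf whose base address and byte position are recorded
in its symbolic number; perform_symbolic_merge below combines them.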
2173 /* Compute the symbolic number N representing the result of a bitwise OR on
2174 two symbolic numbers N1 and N2 whose source statements are respectively
2175 SOURCE_STMT1 and SOURCE_STMT2. */
2176
2177 static gimple *
2178 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2179 gimple *source_stmt2, struct symbolic_number *n2,
2180 struct symbolic_number *n)
2181 {
2182 int i, size;
2183 uint64_t mask;
2184 gimple *source_stmt;
2185 struct symbolic_number *n_start;
2186
2187 tree rhs1 = gimple_assign_rhs1 (source_stmt1);
2188 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2189 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2190 rhs1 = TREE_OPERAND (rhs1, 0);
2191 tree rhs2 = gimple_assign_rhs1 (source_stmt2);
2192 if (TREE_CODE (rhs2) == BIT_FIELD_REF
2193 && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME)
2194 rhs2 = TREE_OPERAND (rhs2, 0);
2195
2196 /* Sources are different; cancel bswap if they are not memory locations with
2197 the same base (array, structure, ...). */
2198 if (rhs1 != rhs2)
2199 {
2200 uint64_t inc;
2201 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2202 struct symbolic_number *toinc_n_ptr, *n_end;
2203 basic_block bb1, bb2;
2204
2205 if (!n1->base_addr || !n2->base_addr
2206 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2207 return NULL;
2208
2209 if (!n1->offset != !n2->offset
2210 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2211 return NULL;
2212
2213 if (n1->bytepos < n2->bytepos)
2214 {
2215 n_start = n1;
2216 start_sub = n2->bytepos - n1->bytepos;
2217 }
2218 else
2219 {
2220 n_start = n2;
2221 start_sub = n1->bytepos - n2->bytepos;
2222 }
2223
2224 bb1 = gimple_bb (source_stmt1);
2225 bb2 = gimple_bb (source_stmt2);
2226 if (dominated_by_p (CDI_DOMINATORS, bb1, bb2))
2227 source_stmt = source_stmt1;
2228 else
2229 source_stmt = source_stmt2;
2230
2231 /* Find the highest address at which a load is performed and
2232 compute related info. */
2233 end1 = n1->bytepos + (n1->range - 1);
2234 end2 = n2->bytepos + (n2->range - 1);
2235 if (end1 < end2)
2236 {
2237 end = end2;
2238 end_sub = end2 - end1;
2239 }
2240 else
2241 {
2242 end = end1;
2243 end_sub = end1 - end2;
2244 }
2245 n_end = (end2 > end1) ? n2 : n1;
2246
2247 /* Find symbolic number whose lsb is the most significant. */
2248 if (BYTES_BIG_ENDIAN)
2249 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2250 else
2251 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2252
2253 n->range = end - n_start->bytepos + 1;
2254
2255 /* Check that the range of memory covered can be represented by
2256 a symbolic number. */
2257 if (n->range > 64 / BITS_PER_MARKER)
2258 return NULL;
2259
2260 /* Reinterpret the byte marks in the symbolic number holding the value
2261 of bigger weight, according to target endianness. */
2262 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2263 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2264 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2265 {
2266 unsigned marker
2267 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2268 if (marker && marker != MARKER_BYTE_UNKNOWN)
2269 toinc_n_ptr->n += inc;
2270 }
2271 }
2272 else
2273 {
2274 n->range = n1->range;
2275 n_start = n1;
2276 source_stmt = source_stmt1;
2277 }
2278
2279 if (!n1->alias_set
2280 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2281 n->alias_set = n1->alias_set;
2282 else
2283 n->alias_set = ptr_type_node;
2284 n->vuse = n_start->vuse;
2285 n->base_addr = n_start->base_addr;
2286 n->offset = n_start->offset;
2287 n->src = n_start->src;
2288 n->bytepos = n_start->bytepos;
2289 n->type = n_start->type;
2290 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2291
2292 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2293 {
2294 uint64_t masked1, masked2;
2295
2296 masked1 = n1->n & mask;
2297 masked2 = n2->n & mask;
2298 if (masked1 && masked2 && masked1 != masked2)
2299 return NULL;
2300 }
2301 n->n = n1->n | n2->n;
2302 n->n_ops = n1->n_ops + n2->n_ops;
2303
2304 return source_stmt;
2305 }
2306
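A hand-worked trace of the merge, for v = p[0] | (p[1] << 8) with p an
unsigned char pointer on a little-endian target: n1 (from p[0]) has
n == 0x01 and bytepos 0; n2 (from p[1] << 8) has n == 0x0100 and bytepos 1.
Here start_sub == 1 and n2 is the number to increment, so its marker becomes
0x0200, and the final OR yields n->n == 0x0201 with a range of 2, which
matches CMPNOP masked to two bytes: a plain little-endian 16-bit load.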
2307 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2308 the operation given by the rhs of STMT on the result. If the operation
2309 could successfully be executed the function returns a gimple stmt whose
2310 rhs's first tree is the expression of the source operand, and NULL
2311 otherwise. */
2312
2313 static gimple *
2314 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2315 {
2316 enum tree_code code;
2317 tree rhs1, rhs2 = NULL;
2318 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2319 enum gimple_rhs_class rhs_class;
2320
2321 if (!limit || !is_gimple_assign (stmt))
2322 return NULL;
2323
2324 rhs1 = gimple_assign_rhs1 (stmt);
2325
2326 if (find_bswap_or_nop_load (stmt, rhs1, n))
2327 return stmt;
2328
2329 /* Handle BIT_FIELD_REF. */
2330 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2331 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2332 {
2333 unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1));
2334 unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2));
2335 if (bitpos % BITS_PER_UNIT == 0
2336 && bitsize % BITS_PER_UNIT == 0
2337 && init_symbolic_number (n, TREE_OPERAND (rhs1, 0)))
2338 {
2339 /* Handle big-endian bit numbering in BIT_FIELD_REF. */
2340 if (BYTES_BIG_ENDIAN)
2341 bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize;
2342
2343 /* Shift. */
2344 if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos))
2345 return NULL;
2346
2347 /* Mask. */
2348 uint64_t mask = 0;
2349 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2350 for (unsigned i = 0; i < bitsize / BITS_PER_UNIT;
2351 i++, tmp <<= BITS_PER_UNIT)
2352 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2353 n->n &= mask;
2354
2355 /* Convert. */
2356 n->type = TREE_TYPE (rhs1);
2357 if (!n->base_addr)
2358 n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2359
2360 return verify_symbolic_number_p (n, stmt) ? stmt : NULL;
2361 }
2362
2363 return NULL;
2364 }
2365
2366 if (TREE_CODE (rhs1) != SSA_NAME)
2367 return NULL;
2368
2369 code = gimple_assign_rhs_code (stmt);
2370 rhs_class = gimple_assign_rhs_class (stmt);
2371 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2372
2373 if (rhs_class == GIMPLE_BINARY_RHS)
2374 rhs2 = gimple_assign_rhs2 (stmt);
2375
2376 /* Handle unary rhs and binary rhs with integer constants as second
2377 operand. */
2378
2379 if (rhs_class == GIMPLE_UNARY_RHS
2380 || (rhs_class == GIMPLE_BINARY_RHS
2381 && TREE_CODE (rhs2) == INTEGER_CST))
2382 {
2383 if (code != BIT_AND_EXPR
2384 && code != LSHIFT_EXPR
2385 && code != RSHIFT_EXPR
2386 && code != LROTATE_EXPR
2387 && code != RROTATE_EXPR
2388 && !CONVERT_EXPR_CODE_P (code))
2389 return NULL;
2390
2391 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2392
2393 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2394 we have to initialize the symbolic number. */
2395 if (!source_stmt1)
2396 {
2397 if (gimple_assign_load_p (stmt)
2398 || !init_symbolic_number (n, rhs1))
2399 return NULL;
2400 source_stmt1 = stmt;
2401 }
2402
2403 switch (code)
2404 {
2405 case BIT_AND_EXPR:
2406 {
2407 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2408 uint64_t val = int_cst_value (rhs2), mask = 0;
2409 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2410
2411 /* Only constants masking full bytes are allowed. */
2412 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2413 if ((val & tmp) != 0 && (val & tmp) != tmp)
2414 return NULL;
2415 else if (val & tmp)
2416 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2417
2418 n->n &= mask;
2419 }
2420 break;
2421 case LSHIFT_EXPR:
2422 case RSHIFT_EXPR:
2423 case LROTATE_EXPR:
2424 case RROTATE_EXPR:
2425 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2426 return NULL;
2427 break;
2428 CASE_CONVERT:
2429 {
2430 int i, type_size, old_type_size;
2431 tree type;
2432
2433 type = gimple_expr_type (stmt);
2434 type_size = TYPE_PRECISION (type);
2435 if (type_size % BITS_PER_UNIT != 0)
2436 return NULL;
2437 type_size /= BITS_PER_UNIT;
2438 if (type_size > 64 / BITS_PER_MARKER)
2439 return NULL;
2440
2441 /* Sign extension: result is dependent on the value. */
2442 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2443 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2444 && HEAD_MARKER (n->n, old_type_size))
2445 for (i = 0; i < type_size - old_type_size; i++)
2446 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2447 << ((type_size - 1 - i) * BITS_PER_MARKER);
2448
2449 if (type_size < 64 / BITS_PER_MARKER)
2450 {
2451 /* If STMT casts to a smaller type mask out the bits not
2452 belonging to the target type. */
2453 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2454 }
2455 n->type = type;
2456 if (!n->base_addr)
2457 n->range = type_size;
2458 }
2459 break;
2460 default:
2461 return NULL;
2462 };
2463 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2464 }
2465
2466 /* Handle binary rhs. */
2467
2468 if (rhs_class == GIMPLE_BINARY_RHS)
2469 {
2470 struct symbolic_number n1, n2;
2471 gimple *source_stmt, *source_stmt2;
2472
2473 if (code != BIT_IOR_EXPR)
2474 return NULL;
2475
2476 if (TREE_CODE (rhs2) != SSA_NAME)
2477 return NULL;
2478
2479 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2480
2481 switch (code)
2482 {
2483 case BIT_IOR_EXPR:
2484 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2485
2486 if (!source_stmt1)
2487 return NULL;
2488
2489 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2490
2491 if (!source_stmt2)
2492 return NULL;
2493
2494 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2495 return NULL;
2496
2497 if (!n1.vuse != !n2.vuse
2498 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2499 return NULL;
2500
2501 source_stmt
2502 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2503
2504 if (!source_stmt)
2505 return NULL;
2506
2507 if (!verify_symbolic_number_p (n, stmt))
2508 return NULL;
2509
2510 break;
2511 default:
2512 return NULL;
2513 }
2514 return source_stmt;
2515 }
2516 return NULL;
2517 }
2518
2519 /* Check if STMT completes a bswap implementation or a read in a given
2520 endianness consisting of ORs, SHIFTs and ANDs, and set *BSWAP
2521 accordingly. It also sets N to represent the kind of operations
2522 performed: size of the resulting expression and whether it works on
2523 a memory source, and if so alias set and vuse. Finally, the
2524 function returns a stmt whose rhs's first tree is the source
2525 expression. */
2526
2527 static gimple *
2528 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2529 {
2530 unsigned rsize;
2531 uint64_t tmpn, mask;
2532 /* The number which the find_bswap_or_nop_1 result should match in order
2533 to have a full byte swap. The number is shifted to the right
2534 according to the size of the symbolic number before using it. */
2535 uint64_t cmpxchg = CMPXCHG;
2536 uint64_t cmpnop = CMPNOP;
2537
2538 gimple *ins_stmt;
2539 int limit;
2540
2541 /* The last parameter determines the search depth limit. It usually
2542 correlates directly to the number n of bytes to be touched. We
2543 increase that number by log2(n) + 1 here in order to also
2544 cover signed -> unsigned conversions of the src operand as can be seen
2545 in libgcc, and for the initial shift/and operation of the src operand. */
2546 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2547 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2548 ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2549
2550 if (!ins_stmt)
2551 return NULL;
2552
2553 /* Find real size of result (highest non-zero byte). */
2554 if (n->base_addr)
2555 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2556 else
2557 rsize = n->range;
2558
2559 /* Zero out the bits corresponding to untouched bytes in the original gimple
2560 expression. */
2561 if (n->range < (int) sizeof (int64_t))
2562 {
2563 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2564 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2565 cmpnop &= mask;
2566 }
2567
2568 /* Zero out the bits corresponding to unused bytes in the result of the
2569 gimple expression. */
2570 if (rsize < n->range)
2571 {
2572 if (BYTES_BIG_ENDIAN)
2573 {
2574 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2575 cmpxchg &= mask;
2576 cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
2577 }
2578 else
2579 {
2580 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2581 cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
2582 cmpnop &= mask;
2583 }
2584 n->range = rsize;
2585 }
2586
2587 /* A complete byte swap should make the symbolic number start with
2588 the largest digit in the highest order byte. An unchanged symbolic
2589 number indicates a read with the same endianness as the target. */
2590 if (n->n == cmpnop)
2591 *bswap = false;
2592 else if (n->n == cmpxchg)
2593 *bswap = true;
2594 else
2595 return NULL;
2596
2597 /* A nop of a single non-memory operand is useless bit manipulation; leave it alone. */
2598 if (!n->base_addr && n->n == cmpnop && n->n_ops == 1)
2599 return NULL;
2600
2601 n->range *= BITS_PER_UNIT;
2602 return ins_stmt;
2603 }
2604
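For illustration, a classic hand-written byte swap (a sketch, not from this
change) that this function recognizes with *bswap == true and a 32-bit range,
allowing it to be replaced by a call to __builtin_bswap32:

    #include <stdint.h>

    uint32_t
    swap32 (uint32_t x)
    {
      return ((x & 0x000000ffu) << 24)
	     | ((x & 0x0000ff00u) << 8)
	     | ((x & 0x00ff0000u) >> 8)
	     | ((x & 0xff000000u) >> 24);
    }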
2605 namespace {
2606
2607 const pass_data pass_data_optimize_bswap =
2608 {
2609 GIMPLE_PASS, /* type */
2610 "bswap", /* name */
2611 OPTGROUP_NONE, /* optinfo_flags */
2612 TV_NONE, /* tv_id */
2613 PROP_ssa, /* properties_required */
2614 0, /* properties_provided */
2615 0, /* properties_destroyed */
2616 0, /* todo_flags_start */
2617 0, /* todo_flags_finish */
2618 };
2619
2620 class pass_optimize_bswap : public gimple_opt_pass
2621 {
2622 public:
2623 pass_optimize_bswap (gcc::context *ctxt)
2624 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2625 {}
2626
2627 /* opt_pass methods: */
2628 virtual bool gate (function *)
2629 {
2630 return flag_expensive_optimizations && optimize;
2631 }
2632
2633 virtual unsigned int execute (function *);
2634
2635 }; // class pass_optimize_bswap
2636
2637 /* Perform the bswap optimization: replace the expression computed in the rhs
2638 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2639 Which of these alternatives replaces the rhs is given by N->base_addr (non
2640 null if a load is needed) and BSWAP. The type, VUSE and alias set of the
2641 load to perform are also given in N, while the builtin bswap to invoke is
2642 given in FNDECL. Finally, if a load is involved, INS_STMT refers to one of
2643 the load statements used to construct the rhs in CUR_STMT and N->range gives
2644 the size of the rhs expression for maintaining some statistics.
2645
2646 Note that if the replacement involves a load, CUR_STMT is moved just before
2647 INS_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2648 changing basic block. */
2649
2650 static bool
2651 bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl,
2652 tree bswap_type, tree load_type, struct symbolic_number *n,
2653 bool bswap)
2654 {
2655 gimple_stmt_iterator gsi;
2656 tree src, tmp, tgt;
2657 gimple *bswap_stmt;
2658
2659 gsi = gsi_for_stmt (cur_stmt);
2660 src = n->src;
2661 tgt = gimple_assign_lhs (cur_stmt);
2662
2663 /* Need to load the value from memory first. */
2664 if (n->base_addr)
2665 {
2666 gimple_stmt_iterator gsi_ins = gsi_for_stmt (ins_stmt);
2667 tree addr_expr, addr_tmp, val_expr, val_tmp;
2668 tree load_offset_ptr, aligned_load_type;
2669 gimple *addr_stmt, *load_stmt;
2670 unsigned align;
2671 HOST_WIDE_INT load_offset = 0;
2672 basic_block ins_bb, cur_bb;
2673
2674 ins_bb = gimple_bb (ins_stmt);
2675 cur_bb = gimple_bb (cur_stmt);
2676 if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
2677 return false;
2678
2679 align = get_object_alignment (src);
2680
2681 /* Move cur_stmt just before one of the loads of the original
2682 expression to ensure it has the same VUSE. See PR61517 for what
2683 could go wrong. */
2684 if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
2685 reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
2686 gsi_move_before (&gsi, &gsi_ins);
2687 gsi = gsi_for_stmt (cur_stmt);
2688
2689 /* Compute address to load from and cast according to the size
2690 of the load. */
2691 addr_expr = build_fold_addr_expr (unshare_expr (src));
2692 if (is_gimple_mem_ref_addr (addr_expr))
2693 addr_tmp = addr_expr;
2694 else
2695 {
2696 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2697 "load_src");
2698 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2699 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2700 }
2701
2702 /* Perform the load. */
2703 aligned_load_type = load_type;
2704 if (align < TYPE_ALIGN (load_type))
2705 aligned_load_type = build_aligned_type (load_type, align);
2706 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2707 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2708 load_offset_ptr);
2709
2710 if (!bswap)
2711 {
2712 if (n->range == 16)
2713 nop_stats.found_16bit++;
2714 else if (n->range == 32)
2715 nop_stats.found_32bit++;
2716 else
2717 {
2718 gcc_assert (n->range == 64);
2719 nop_stats.found_64bit++;
2720 }
2721
2722 /* Convert the result of load if necessary. */
2723 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2724 {
2725 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2726 "load_dst");
2727 load_stmt = gimple_build_assign (val_tmp, val_expr);
2728 gimple_set_vuse (load_stmt, n->vuse);
2729 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2730 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2731 }
2732 else
2733 {
2734 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2735 gimple_set_vuse (cur_stmt, n->vuse);
2736 }
2737 update_stmt (cur_stmt);
2738
2739 if (dump_file)
2740 {
2741 fprintf (dump_file,
2742 "%d bit load in target endianness found at: ",
2743 (int) n->range);
2744 print_gimple_stmt (dump_file, cur_stmt, 0);
2745 }
2746 return true;
2747 }
2748 else
2749 {
2750 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2751 load_stmt = gimple_build_assign (val_tmp, val_expr);
2752 gimple_set_vuse (load_stmt, n->vuse);
2753 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2754 }
2755 src = val_tmp;
2756 }
2757 else if (!bswap)
2758 {
2759 gimple *g;
2760 if (!useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src)))
2761 {
2762 if (!is_gimple_val (src))
2763 return false;
2764 g = gimple_build_assign (tgt, NOP_EXPR, src);
2765 }
2766 else
2767 g = gimple_build_assign (tgt, src);
2768 if (n->range == 16)
2769 nop_stats.found_16bit++;
2770 else if (n->range == 32)
2771 nop_stats.found_32bit++;
2772 else
2773 {
2774 gcc_assert (n->range == 64);
2775 nop_stats.found_64bit++;
2776 }
2777 if (dump_file)
2778 {
2779 fprintf (dump_file,
2780 "%d bit reshuffle in target endianness found at: ",
2781 (int) n->range);
2782 print_gimple_stmt (dump_file, cur_stmt, 0);
2783 }
2784 gsi_replace (&gsi, g, true);
2785 return true;
2786 }
2787 else if (TREE_CODE (src) == BIT_FIELD_REF)
2788 src = TREE_OPERAND (src, 0);
2789
2790 if (n->range == 16)
2791 bswap_stats.found_16bit++;
2792 else if (n->range == 32)
2793 bswap_stats.found_32bit++;
2794 else
2795 {
2796 gcc_assert (n->range == 64);
2797 bswap_stats.found_64bit++;
2798 }
2799
2800 tmp = src;
2801
2802 /* Convert the src expression if necessary. */
2803 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2804 {
2805 gimple *convert_stmt;
2806
2807 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2808 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2809 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2810 }
2811
2812 /* The canonical form for a 16-bit bswap is a rotate expression. Only 16-bit
2813 values are handled this way, as rotating a 2N-bit value by N bits is
2814 generally not equivalent to a bswap. Consider for instance 0x01020304 r>> 16,
2815 which gives 0x03040102 while a bswap for that value is 0x04030201. */
2816 if (bswap && n->range == 16)
2817 {
2818 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2819 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2820 bswap_stmt = gimple_build_assign (NULL, src);
2821 }
2822 else
2823 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2824
2825 tmp = tgt;
2826
2827 /* Convert the result if necessary. */
2828 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2829 {
2830 gimple *convert_stmt;
2831
2832 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2833 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2834 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2835 }
2836
2837 gimple_set_lhs (bswap_stmt, tmp);
2838
2839 if (dump_file)
2840 {
2841 fprintf (dump_file, "%d bit bswap implementation found at: ",
2842 (int) n->range);
2843 print_gimple_stmt (dump_file, cur_stmt, 0);
2844 }
2845
2846 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2847 gsi_remove (&gsi, true);
2848 return true;
2849 }
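For the 16-bit case discussed above, a hand-written sketch:

    uint16_t swap16 (uint16_t x) { return (x << 8) | (x >> 8); }

is replaced by (and, when already written as a rotate, recognized in) the
canonical form x r<< 8 rather than a call to a bswap built-in.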
2850
2851 /* Find manual byte swap implementations as well as loads in a given
2852 endianness. Byte swaps are turned into a bswap builtin invocation
2853 while endian loads are converted to a bswap builtin invocation or a
2854 simple load according to the target endianness. */
2855
2856 unsigned int
2857 pass_optimize_bswap::execute (function *fun)
2858 {
2859 basic_block bb;
2860 bool bswap32_p, bswap64_p;
2861 bool changed = false;
2862 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2863
2864 if (BITS_PER_UNIT != 8)
2865 return 0;
2866
2867 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2868 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2869 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2870 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2871 || (bswap32_p && word_mode == SImode)));
2872
2873 /* Determine the argument type of the builtins. The code later on
2874 assumes that the return and argument type are the same. */
2875 if (bswap32_p)
2876 {
2877 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2878 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2879 }
2880
2881 if (bswap64_p)
2882 {
2883 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2884 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2885 }
2886
2887 memset (&nop_stats, 0, sizeof (nop_stats));
2888 memset (&bswap_stats, 0, sizeof (bswap_stats));
2889 calculate_dominance_info (CDI_DOMINATORS);
2890
2891 FOR_EACH_BB_FN (bb, fun)
2892 {
2893 gimple_stmt_iterator gsi;
2894
2895 /* We do a reverse scan for bswap patterns to make sure we get the
2896 widest match. As bswap pattern matching doesn't handle previously
2897 inserted smaller bswap replacements as sub-patterns, the wider
2898 variant wouldn't be detected. */
2899 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2900 {
2901 gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
2902 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2903 enum tree_code code;
2904 struct symbolic_number n;
2905 bool bswap;
2906
2907 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2908 might be moved to a different basic block by bswap_replace, and gsi
2909 must not point to it in that case. Doing the gsi_prev here makes
2910 sure that gsi points to the statement preceding cur_stmt while
2911 still guaranteeing that all statements in this basic block are
2912 considered. */
2913 gsi_prev (&gsi);
2914
2915 if (!is_gimple_assign (cur_stmt))
2916 continue;
2917
2918 code = gimple_assign_rhs_code (cur_stmt);
2919 switch (code)
2920 {
2921 case LROTATE_EXPR:
2922 case RROTATE_EXPR:
2923 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2924 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2925 % BITS_PER_UNIT)
2926 continue;
2927 /* Fall through. */
2928 case BIT_IOR_EXPR:
2929 break;
2930 default:
2931 continue;
2932 }
2933
2934 ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2935
2936 if (!ins_stmt)
2937 continue;
2938
2939 switch (n.range)
2940 {
2941 case 16:
2942 /* Already in canonical form, nothing to do. */
2943 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2944 continue;
2945 load_type = bswap_type = uint16_type_node;
2946 break;
2947 case 32:
2948 load_type = uint32_type_node;
2949 if (bswap32_p)
2950 {
2951 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2952 bswap_type = bswap32_type;
2953 }
2954 break;
2955 case 64:
2956 load_type = uint64_type_node;
2957 if (bswap64_p)
2958 {
2959 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2960 bswap_type = bswap64_type;
2961 }
2962 break;
2963 default:
2964 continue;
2965 }
2966
2967 if (bswap && !fndecl && n.range != 16)
2968 continue;
2969
2970 if (bswap_replace (cur_stmt, ins_stmt, fndecl, bswap_type, load_type,
2971 &n, bswap))
2972 changed = true;
2973 }
2974 }
2975
2976 statistics_counter_event (fun, "16-bit nop implementations found",
2977 nop_stats.found_16bit);
2978 statistics_counter_event (fun, "32-bit nop implementations found",
2979 nop_stats.found_32bit);
2980 statistics_counter_event (fun, "64-bit nop implementations found",
2981 nop_stats.found_64bit);
2982 statistics_counter_event (fun, "16-bit bswap implementations found",
2983 bswap_stats.found_16bit);
2984 statistics_counter_event (fun, "32-bit bswap implementations found",
2985 bswap_stats.found_32bit);
2986 statistics_counter_event (fun, "64-bit bswap implementations found",
2987 bswap_stats.found_64bit);
2988
2989 return (changed ? TODO_update_ssa : 0);
2990 }
2991
2992 } // anon namespace
2993
2994 gimple_opt_pass *
2995 make_pass_optimize_bswap (gcc::context *ctxt)
2996 {
2997 return new pass_optimize_bswap (ctxt);
2998 }
2999
3000 /* Return true if stmt is a type conversion operation that can be stripped 2283 /* Return true if stmt is a type conversion operation that can be stripped
3001 when used in a widening multiply operation. */ 2284 when used in a widening multiply operation. */
3002 static bool 2285 static bool
3003 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt) 2286 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
3004 { 2287 {
3109 tree *type1_out, tree *rhs1_out, 2392 tree *type1_out, tree *rhs1_out,
3110 tree *type2_out, tree *rhs2_out) 2393 tree *type2_out, tree *rhs2_out)
3111 { 2394 {
3112 tree type = TREE_TYPE (gimple_assign_lhs (stmt)); 2395 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3113 2396
3114 if (TREE_CODE (type) != INTEGER_TYPE 2397 if (TREE_CODE (type) == INTEGER_TYPE)
3115 && TREE_CODE (type) != FIXED_POINT_TYPE) 2398 {
2399 if (TYPE_OVERFLOW_TRAPS (type))
2400 return false;
2401 }
2402 else if (TREE_CODE (type) != FIXED_POINT_TYPE)
3116 return false; 2403 return false;
3117 2404
3118 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out, 2405 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
3119 rhs1_out)) 2406 rhs1_out))
3120 return false; 2407 return false;
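/* A sketch of a typical candidate for the conversion below (illustrative
   only): with 32-bit int operands a and b,

     long long r = (long long) a * b;

   is a widening multiplication and is rewritten to a WIDEN_MULT_EXPR
   whenever the target implements the matching [u|s|us]mul_widen_optab. */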
3240 static bool 2527 static bool
3241 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi) 2528 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
3242 { 2529 {
3243 tree lhs, rhs1, rhs2, type, type1, type2; 2530 tree lhs, rhs1, rhs2, type, type1, type2;
3244 enum insn_code handler; 2531 enum insn_code handler;
3245 machine_mode to_mode, from_mode, actual_mode; 2532 scalar_int_mode to_mode, from_mode, actual_mode;
3246 optab op; 2533 optab op;
3247 int actual_precision; 2534 int actual_precision;
3248 location_t loc = gimple_location (stmt); 2535 location_t loc = gimple_location (stmt);
3249 bool from_unsigned1, from_unsigned2; 2536 bool from_unsigned1, from_unsigned2;
3250 2537
3256 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2)) 2543 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3257 return false; 2544 return false;
3258 2545
3259 to_mode = SCALAR_INT_TYPE_MODE (type); 2546 to_mode = SCALAR_INT_TYPE_MODE (type);
3260 from_mode = SCALAR_INT_TYPE_MODE (type1); 2547 from_mode = SCALAR_INT_TYPE_MODE (type1);
2548 if (to_mode == from_mode)
2549 return false;
2550
3261 from_unsigned1 = TYPE_UNSIGNED (type1); 2551 from_unsigned1 = TYPE_UNSIGNED (type1);
3262 from_unsigned2 = TYPE_UNSIGNED (type2); 2552 from_unsigned2 = TYPE_UNSIGNED (type2);
3263 2553
3264 if (from_unsigned1 && from_unsigned2) 2554 if (from_unsigned1 && from_unsigned2)
3265 op = umul_widen_optab; 2555 op = umul_widen_optab;
3267 op = smul_widen_optab; 2557 op = smul_widen_optab;
3268 else 2558 else
3269 op = usmul_widen_optab; 2559 op = usmul_widen_optab;
3270 2560
3271 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode, 2561 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3272 0, &actual_mode); 2562 &actual_mode);
3273 2563
3274 if (handler == CODE_FOR_nothing) 2564 if (handler == CODE_FOR_nothing)
3275 { 2565 {
3276 if (op != smul_widen_optab) 2566 if (op != smul_widen_optab)
3277 { 2567 {
3288 return false; 2578 return false;
3289 } 2579 }
3290 2580
3291 op = smul_widen_optab; 2581 op = smul_widen_optab;
3292 handler = find_widening_optab_handler_and_mode (op, to_mode, 2582 handler = find_widening_optab_handler_and_mode (op, to_mode,
3293 from_mode, 0, 2583 from_mode,
3294 &actual_mode); 2584 &actual_mode);
3295 2585
3296 if (handler == CODE_FOR_nothing) 2586 if (handler == CODE_FOR_nothing)
3297 return false; 2587 return false;
3298 2588
3348 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs; 2638 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3349 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK; 2639 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3350 optab this_optab; 2640 optab this_optab;
3351 enum tree_code wmult_code; 2641 enum tree_code wmult_code;
3352 enum insn_code handler; 2642 enum insn_code handler;
3353 scalar_mode to_mode, from_mode; 2643 scalar_mode to_mode, from_mode, actual_mode;
3354 machine_mode actual_mode;
3355 location_t loc = gimple_location (stmt); 2644 location_t loc = gimple_location (stmt);
3356 int actual_precision; 2645 int actual_precision;
3357 bool from_unsigned1, from_unsigned2; 2646 bool from_unsigned1, from_unsigned2;
3358 2647
3359 lhs = gimple_assign_lhs (stmt); 2648 lhs = gimple_assign_lhs (stmt);
3447 else 2736 else
3448 return false; 2737 return false;
3449 2738
3450 to_mode = SCALAR_TYPE_MODE (type); 2739 to_mode = SCALAR_TYPE_MODE (type);
3451 from_mode = SCALAR_TYPE_MODE (type1); 2740 from_mode = SCALAR_TYPE_MODE (type1);
2741 if (to_mode == from_mode)
2742 return false;
2743
3452 from_unsigned1 = TYPE_UNSIGNED (type1); 2744 from_unsigned1 = TYPE_UNSIGNED (type1);
3453 from_unsigned2 = TYPE_UNSIGNED (type2); 2745 from_unsigned2 = TYPE_UNSIGNED (type2);
3454 optype = type1; 2746 optype = type1;
3455 2747
3456 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */ 2748 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3507 /* Verify that the machine can perform a widening multiply 2799 /* Verify that the machine can perform a widening multiply
3508 accumulate in this mode/signedness combination, otherwise 2800 accumulate in this mode/signedness combination, otherwise
3509 this transformation is likely to pessimize code. */ 2801 this transformation is likely to pessimize code. */
3510 this_optab = optab_for_tree_code (wmult_code, optype, optab_default); 2802 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3511 handler = find_widening_optab_handler_and_mode (this_optab, to_mode, 2803 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3512 from_mode, 0, &actual_mode); 2804 from_mode, &actual_mode);
3513 2805
3514 if (handler == CODE_FOR_nothing) 2806 if (handler == CODE_FOR_nothing)
3515 return false; 2807 return false;
3516 2808
3517 /* Ensure that the inputs to the handler are in the correct precision 2809 /* Ensure that the inputs to the handler are in the correct precision
3544 update_stmt (gsi_stmt (*gsi)); 2836 update_stmt (gsi_stmt (*gsi));
3545 widen_mul_stats.maccs_inserted++; 2837 widen_mul_stats.maccs_inserted++;
3546 return true; 2838 return true;
3547 } 2839 }
3548 2840
2841 /* Given MUL_RESULT, the result of a multiplication of OP1 and OP2 which
2842 we know is used in statements that can, together with the
2843 multiplication, be converted to FMAs, perform the transformation. */
2844
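/* For instance (an illustrative input, not from this change), given

     tmp = a * b;
     res = tmp + c;

   the loop below replaces the addition with res = .FMA (a, b, c); the
   subsequent fold_stmt with follow_all_ssa_edges then turns negated
   variants into FMS, FNMA or FNMS as appropriate. */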
2845 static void
2846 convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
2847 {
2848 tree type = TREE_TYPE (mul_result);
2849 gimple *use_stmt;
2850 imm_use_iterator imm_iter;
2851 gcall *fma_stmt;
2852
2853 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2854 {
2855 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2856 tree addop, mulop1 = op1, result = mul_result;
2857 bool negate_p = false;
2858 gimple_seq seq = NULL;
2859
2860 if (is_gimple_debug (use_stmt))
2861 continue;
2862
2863 if (is_gimple_assign (use_stmt)
2864 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
2865 {
2866 result = gimple_assign_lhs (use_stmt);
2867 use_operand_p use_p;
2868 gimple *neguse_stmt;
2869 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2870 gsi_remove (&gsi, true);
2871 release_defs (use_stmt);
2872
2873 use_stmt = neguse_stmt;
2874 gsi = gsi_for_stmt (use_stmt);
2875 negate_p = true;
2876 }
2877
2878 tree cond, else_value, ops[3];
2879 tree_code code;
2880 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code,
2881 ops, &else_value))
2882 gcc_unreachable ();
2883 addop = ops[0] == result ? ops[1] : ops[0];
2884
2885 if (code == MINUS_EXPR)
2886 {
2887 if (ops[0] == result)
2888 /* a * b - c -> a * b + (-c) */
2889 addop = gimple_build (&seq, NEGATE_EXPR, type, addop);
2890 else
2891 /* a - b * c -> (-b) * c + a */
2892 negate_p = !negate_p;
2893 }
2894
2895 if (negate_p)
2896 mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
2897
2898 if (seq)
2899 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
2900
2901 if (cond)
2902 fma_stmt = gimple_build_call_internal (IFN_COND_FMA, 5, cond, mulop1,
2903 op2, addop, else_value);
2904 else
2905 fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop);
2906 gimple_set_lhs (fma_stmt, gimple_get_lhs (use_stmt));
2907 gimple_call_set_nothrow (fma_stmt, !stmt_can_throw_internal (cfun,
2908 use_stmt));
2909 gsi_replace (&gsi, fma_stmt, true);
2910 /* Follow all SSA edges so that we generate FMS, FNMA and FNMS
2911 regardless of where the negation occurs. */
2912 if (fold_stmt (&gsi, follow_all_ssa_edges))
2913 update_stmt (gsi_stmt (gsi));
2914
2915 if (dump_file && (dump_flags & TDF_DETAILS))
2916 {
2917 fprintf (dump_file, "Generated FMA ");
2918 print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2919 fprintf (dump_file, "\n");
2920 }
2921
2922 widen_mul_stats.fmas_inserted++;
2923 }
2924 }
2925
2926 /* Data necessary to perform the actual transformation from a multiplication
2927 and an addition to an FMA once the decision to do so has been taken, and
2928 to then delete the multiplication statement from the function IL. */
2929
2930 struct fma_transformation_info
2931 {
2932 gimple *mul_stmt;
2933 tree mul_result;
2934 tree op1;
2935 tree op2;
2936 };
2937
2938 /* Structure containing the current state of FMA deferring, i.e. whether we are
2939 deferring, whether to continue deferring, and all data necessary to come
2940 back and perform all deferred transformations. */
2941
2942 class fma_deferring_state
2943 {
2944 public:
2945 /* Class constructor. Pass true as PERFORM_DEFERRING in order to actually
2946 do any deferring. */
2947
2948 fma_deferring_state (bool perform_deferring)
2949 : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
2950 m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
2951
2952 /* List of FMA candidates for which the transformation has been determined
2953 to be possible but which, at this point in the BB analysis, we do not
2954 consider beneficial. */
2955 auto_vec<fma_transformation_info, 8> m_candidates;
2956
2957 /* Set of multiplication results that are part of already deferred FMA
2958 candidates. */
2959 hash_set<tree> m_mul_result_set;
2960
2961 /* The PHI that supposedly feeds the result of one FMA back to another
2962 over a loop boundary. */
2963 gphi *m_initial_phi;
2964
2965 /* Result of the last produced FMA candidate or NULL if there has not been
2966 one. */
2967 tree m_last_result;
2968
2969 /* If true, deferring might still be profitable. If false, transform all
2970 candidates and no longer defer. */
2971 bool m_deferring_p;
2972 };
2973
2974 /* Transform all deferred FMA candidates and mark STATE as no longer
2975 deferring. */
2976
2977 static void
2978 cancel_fma_deferring (fma_deferring_state *state)
2979 {
2980 if (!state->m_deferring_p)
2981 return;
2982
2983 for (unsigned i = 0; i < state->m_candidates.length (); i++)
2984 {
2985 if (dump_file && (dump_flags & TDF_DETAILS))
2986 fprintf (dump_file, "Generating deferred FMA\n");
2987
2988 const fma_transformation_info &fti = state->m_candidates[i];
2989 convert_mult_to_fma_1 (fti.mul_result, fti.op1, fti.op2);
2990
2991 gimple_stmt_iterator gsi = gsi_for_stmt (fti.mul_stmt);
2992 gsi_remove (&gsi, true);
2993 release_defs (fti.mul_stmt);
2994 }
2995 state->m_deferring_p = false;
2996 }
2997
2998 /* If OP is an SSA name defined by a PHI node, return the PHI statement.
2999 Otherwise return NULL. */
3000
3001 static gphi *
3002 result_of_phi (tree op)
3003 {
3004 if (TREE_CODE (op) != SSA_NAME)
3005 return NULL;
3006
3007 return dyn_cast <gphi *> (SSA_NAME_DEF_STMT (op));
3008 }
3009
3010 /* After processing statements of a BB and recording STATE, return true if the
3011 initial phi is fed by the last FMA candidate result or by one such result
3012 from previously processed BBs marked in LAST_RESULT_SET. */
3013
3014 static bool
3015 last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
3016 hash_set<tree> *last_result_set)
3017 {
3018 ssa_op_iter iter;
3019 use_operand_p use;
3020 FOR_EACH_PHI_ARG (use, state->m_initial_phi, iter, SSA_OP_USE)
3021 {
3022 tree t = USE_FROM_PTR (use);
3023 if (t == state->m_last_result
3024 || last_result_set->contains (t))
3025 return true;
3026 }
3027
3028 return false;
3029 }
3030
3549 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2 3031 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3550 with uses in additions and subtractions to form fused multiply-add 3032 with uses in additions and subtractions to form fused multiply-add
3551 operations. Returns true if successful and MUL_STMT should be removed. */ 3033 operations. Returns true if successful and MUL_STMT should be removed.
3034
3035 If STATE indicates that we are deferring FMA transformation, that means
3036 that we do not produce FMAs for basic blocks which look like:
3037
3038 <bb 6>
3039 # accumulator_111 = PHI <0.0(5), accumulator_66(6)>
3040 _65 = _14 * _16;
3041 accumulator_66 = _65 + accumulator_111;
3042
3043 or its unrolled version, i.e. with several FMA candidates that feed the
3044 result of one into the addend of another. Instead, we add them to a list in STATE
3045 and if we later discover an FMA candidate that is not part of such a chain,
3046 we go back and perform all deferred past candidates. */
3552 3047
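/* A sketch of source that triggers the deferring described above (an
   assumed typical input):

     double acc = 0.0;
     for (int i = 0; i < n; i++)
       acc += a[i] * b[i];

   Here each candidate FMA would consume the result of the previous one
   through the accumulator PHI, so on targets that limit such chains via
   the avoid-fma-max-bits parameter the conversion is deferred and may be
   cancelled. */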
3553 static bool 3048 static bool
3554 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2) 3049 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
3050 fma_deferring_state *state)
3555 { 3051 {
3556 tree mul_result = gimple_get_lhs (mul_stmt); 3052 tree mul_result = gimple_get_lhs (mul_stmt);
3557 tree type = TREE_TYPE (mul_result); 3053 tree type = TREE_TYPE (mul_result);
3558 gimple *use_stmt, *neguse_stmt; 3054 gimple *use_stmt, *neguse_stmt;
3559 gassign *fma_stmt;
3560 use_operand_p use_p; 3055 use_operand_p use_p;
3561 imm_use_iterator imm_iter; 3056 imm_use_iterator imm_iter;
3562 3057
3563 if (FLOAT_TYPE_P (type) 3058 if (FLOAT_TYPE_P (type)
3564 && flag_fp_contract_mode == FP_CONTRACT_OFF) 3059 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3565 return false; 3060 return false;
3566 3061
3567 /* We don't want to do bitfield reduction ops. */ 3062 /* We don't want to do bitfield reduction ops. */
3568 if (INTEGRAL_TYPE_P (type) 3063 if (INTEGRAL_TYPE_P (type)
3569 && !type_has_mode_precision_p (type)) 3064 && (!type_has_mode_precision_p (type) || TYPE_OVERFLOW_TRAPS (type)))
3570 return false; 3065 return false;
3571 3066
3572 /* If the target doesn't support it, don't generate it. We assume that 3067 /* If the target doesn't support it, don't generate it. We assume that
3573 if fma isn't available then fms, fnma or fnms are not either. */ 3068 if fma isn't available then fms, fnma or fnms are not either. */
3574 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing) 3069 optimization_type opt_type = bb_optimization_type (gimple_bb (mul_stmt));
3070 if (!direct_internal_fn_supported_p (IFN_FMA, type, opt_type))
3575 return false; 3071 return false;
3576 3072
3577 /* If the multiplication has zero uses, it is kept around probably because 3073 /* If the multiplication has zero uses, it is kept around probably because
3578 of -fnon-call-exceptions. Don't optimize it away in that case, 3074 of -fnon-call-exceptions. Don't optimize it away in that case,
3579 it is DCE's job. */ 3075 it is DCE's job. */
3580 if (has_zero_uses (mul_result)) 3076 if (has_zero_uses (mul_result))
3581 return false; 3077 return false;
3582 3078
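/* Deferring is only considered for types no wider than the
   --param avoid-fma-max-bits limit, which is zero by default, so the
   heuristic only fires when a target or the user raises it. */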
3079 bool check_defer
3080 = (state->m_deferring_p
3081 && (tree_to_shwi (TYPE_SIZE (type))
3082 <= PARAM_VALUE (PARAM_AVOID_FMA_MAX_BITS)));
3083 bool defer = check_defer;
3583 /* Make sure that the multiplication statement becomes dead after 3084 /* Make sure that the multiplication statement becomes dead after
3584 the transformation, i.e. that all uses are transformed to FMAs. 3085 the transformation, i.e. that all uses are transformed to FMAs.
3585 This means we assume that an FMA operation has the same cost 3086 This means we assume that an FMA operation has the same cost
3586 as an addition. */ 3087 as an addition. */
3587 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result) 3088 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3588 { 3089 {
3589 enum tree_code use_code;
3590 tree result = mul_result; 3090 tree result = mul_result;
3591 bool negate_p = false; 3091 bool negate_p = false;
3592 3092
3593 use_stmt = USE_STMT (use_p); 3093 use_stmt = USE_STMT (use_p);
3594 3094
3605 to form a fma in the then block and sink the multiplication to the 3105 to form a fma in the then block and sink the multiplication to the
3606 else block. */ 3106 else block. */
3607 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt)) 3107 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3608 return false; 3108 return false;
3609 3109
3610 if (!is_gimple_assign (use_stmt))
3611 return false;
3612
3613 use_code = gimple_assign_rhs_code (use_stmt);
3614
3615 /* A negate on the multiplication leads to FNMA. */ 3110 /* A negate on the multiplication leads to FNMA. */
3616 if (use_code == NEGATE_EXPR) 3111 if (is_gimple_assign (use_stmt)
3112 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
3617 { 3113 {
3618 ssa_op_iter iter; 3114 ssa_op_iter iter;
3619 use_operand_p usep; 3115 use_operand_p usep;
3620 3116
3621 result = gimple_assign_lhs (use_stmt); 3117 result = gimple_assign_lhs (use_stmt);
3633 3129
3634 /* Re-validate. */ 3130 /* Re-validate. */
3635 use_stmt = neguse_stmt; 3131 use_stmt = neguse_stmt;
3636 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt)) 3132 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3637 return false; 3133 return false;
3638 if (!is_gimple_assign (use_stmt)) 3134
3639 return false;
3640
3641 use_code = gimple_assign_rhs_code (use_stmt);
3642 negate_p = true; 3135 negate_p = true;
3643 } 3136 }
3644 3137
3645 switch (use_code) 3138 tree cond, else_value, ops[3];
3139 tree_code code;
3140 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code, ops,
3141 &else_value))
3142 return false;
3143
3144 switch (code)
3646 { 3145 {
3647 case MINUS_EXPR: 3146 case MINUS_EXPR:
3648 if (gimple_assign_rhs2 (use_stmt) == result) 3147 if (ops[1] == result)
3649 negate_p = !negate_p; 3148 negate_p = !negate_p;
3650 break; 3149 break;
3651 case PLUS_EXPR: 3150 case PLUS_EXPR:
3652 break; 3151 break;
3653 default: 3152 default:
3654 /* FMA can only be formed from PLUS and MINUS. */ 3153 /* FMA can only be formed from PLUS and MINUS. */
3655 return false; 3154 return false;
3656 } 3155 }
3657 3156
3658 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed 3157 if (cond)
3659 by a MULT_EXPR that we'll visit later, we might be able to 3158 {
3660 get a more profitable match with fnma. 3159 if (cond == result || else_value == result)
3160 return false;
3161 if (!direct_internal_fn_supported_p (IFN_COND_FMA, type, opt_type))
3162 return false;
3163 }
3164
3165 /* If the subtrahend (OPS[1]) is computed by a MULT_EXPR that
3166 we'll visit later, we might be able to get a more profitable
3167 match with fnma.
3661 OTOH, if we don't, a negate / fma pair has likely lower latency 3168 OTOH, if we don't, a negate / fma pair has likely lower latency
3662 than a mult / subtract pair. */ 3169 than a mult / subtract pair. */
3663 if (use_code == MINUS_EXPR && !negate_p 3170 if (code == MINUS_EXPR
3664 && gimple_assign_rhs1 (use_stmt) == result 3171 && !negate_p
3665 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing 3172 && ops[0] == result
3666 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing) 3173 && !direct_internal_fn_supported_p (IFN_FMS, type, opt_type)
3667 { 3174 && direct_internal_fn_supported_p (IFN_FNMA, type, opt_type)
3668 tree rhs2 = gimple_assign_rhs2 (use_stmt); 3175 && TREE_CODE (ops[1]) == SSA_NAME
3669 3176 && has_single_use (ops[1]))
3670 if (TREE_CODE (rhs2) == SSA_NAME) 3177 {
3178 gimple *stmt2 = SSA_NAME_DEF_STMT (ops[1]);
3179 if (is_gimple_assign (stmt2)
3180 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3181 return false;
3182 }
3183
3184 /* We can't handle a * b + a * b. */
3185 if (ops[0] == ops[1])
3186 return false;
3187 /* If deferring, make sure we are not looking at an instruction that
3188 would not have existed had we not been deferring. */
3189 if (state->m_deferring_p
3190 && (state->m_mul_result_set.contains (ops[0])
3191 || state->m_mul_result_set.contains (ops[1])))
3192 return false;
3193
3194 if (check_defer)
3195 {
3196 tree use_lhs = gimple_get_lhs (use_stmt);
3197 if (state->m_last_result)
3671 { 3198 {
3672 gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2); 3199 if (ops[1] == state->m_last_result
3673 if (has_single_use (rhs2) 3200 || ops[0] == state->m_last_result)
3674 && is_gimple_assign (stmt2) 3201 defer = true;
3675 && gimple_assign_rhs_code (stmt2) == MULT_EXPR) 3202 else
3676 return false; 3203 defer = false;
3677 } 3204 }
3678 } 3205 else
3679 3206 {
3680 /* We can't handle a * b + a * b. */ 3207 gcc_checking_assert (!state->m_initial_phi);
3681 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt)) 3208 gphi *phi;
3682 return false; 3209 if (ops[0] == result)
3683 3210 phi = result_of_phi (ops[1]);
3684 /* While it is possible to validate whether or not the exact form 3211 else
3685 that we've recognized is available in the backend, the assumption 3212 {
3686 is that the transformation is never a loss. For instance, suppose 3213 gcc_assert (ops[1] == result);
3687 the target only has the plain FMA pattern available. Consider 3214 phi = result_of_phi (ops[0]);
3688 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which 3215 }
3689 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we 3216
3690 still have 3 operations, but in the FMA form the two NEGs are 3217 if (phi)
3691 independent and could be run in parallel. */ 3218 {
3692 } 3219 state->m_initial_phi = phi;
3693 3220 defer = true;
3694 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result) 3221 }
3695 { 3222 else
3696 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt); 3223 defer = false;
3697 enum tree_code use_code; 3224 }
3698 tree addop, mulop1 = op1, result = mul_result; 3225
3699 bool negate_p = false; 3226 state->m_last_result = use_lhs;
3700 3227 check_defer = false;
3701 if (is_gimple_debug (use_stmt))
3702 continue;
3703
3704 use_code = gimple_assign_rhs_code (use_stmt);
3705 if (use_code == NEGATE_EXPR)
3706 {
3707 result = gimple_assign_lhs (use_stmt);
3708 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3709 gsi_remove (&gsi, true);
3710 release_defs (use_stmt);
3711
3712 use_stmt = neguse_stmt;
3713 gsi = gsi_for_stmt (use_stmt);
3714 use_code = gimple_assign_rhs_code (use_stmt);
3715 negate_p = true;
3716 }
3717
3718 if (gimple_assign_rhs1 (use_stmt) == result)
3719 {
3720 addop = gimple_assign_rhs2 (use_stmt);
3721 /* a * b - c -> a * b + (-c) */
3722 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3723 addop = force_gimple_operand_gsi (&gsi,
3724 build1 (NEGATE_EXPR,
3725 type, addop),
3726 true, NULL_TREE, true,
3727 GSI_SAME_STMT);
3728 } 3228 }
3729 else 3229 else
3730 { 3230 defer = false;
3731 addop = gimple_assign_rhs1 (use_stmt); 3231
3732 /* a - b * c -> (-b) * c + a */ 3232 /* While it is possible to validate whether or not the exact form that
3733 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) 3233 we've recognized is available in the backend, the assumption is that
3734 negate_p = !negate_p; 3234 if the deferring logic above did not trigger, the transformation is
3735 } 3235 never a loss. For instance, suppose the target only has the plain FMA
3736 3236 pattern available. Consider a*b-c -> fma(a,b,-c): we've exchanged
3737 if (negate_p) 3237 MUL+SUB for FMA+NEG, which is still two operations. Consider
3738 mulop1 = force_gimple_operand_gsi (&gsi, 3238 -(a*b)-c -> fma(-a,b,-c): we still have 3 operations, but in the FMA
3739 build1 (NEGATE_EXPR, 3239 form the two NEGs are independent and could be run in parallel. */
3740 type, mulop1), 3240 }
3741 true, NULL_TREE, true, 3241
3742 GSI_SAME_STMT); 3242 if (defer)
3743 3243 {
3744 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt), 3244 fma_transformation_info fti;
3745 FMA_EXPR, mulop1, op2, addop); 3245 fti.mul_stmt = mul_stmt;
3746 gsi_replace (&gsi, fma_stmt, true); 3246 fti.mul_result = mul_result;
3747 widen_mul_stats.fmas_inserted++; 3247 fti.op1 = op1;
3748 } 3248 fti.op2 = op2;
3749 3249 state->m_candidates.safe_push (fti);
3750 return true; 3250 state->m_mul_result_set.add (mul_result);
3251
3252 if (dump_file && (dump_flags & TDF_DETAILS))
3253 {
3254 fprintf (dump_file, "Deferred generating FMA for multiplication ");
3255 print_gimple_stmt (dump_file, mul_stmt, 0, TDF_NONE);
3256 fprintf (dump_file, "\n");
3257 }
3258
3259 return false;
3260 }
3261 else
3262 {
3263 if (state->m_deferring_p)
3264 cancel_fma_deferring (state);
3265 convert_mult_to_fma_1 (mul_result, op1, op2);
3266 return true;
3267 }
3751 } 3268 }
3752 3269
3753 3270
3754 /* Helper function of match_uaddsub_overflow. Return 1 3271 /* Helper function of match_uaddsub_overflow. Return 1
3755 if USE_STMT is an unsigned overflow check ovf != 0 for 3272 if USE_STMT is an unsigned overflow check ovf != 0 for
4016 IMAGPART_EXPR for mod). */ 3533 IMAGPART_EXPR for mod). */
4017 3534
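/* A sketch of a typical candidate pair (illustrative only):

     q = x / y;
     r = x % y;

   is combined into a single t = DIVMOD (x, y) internal call, with the
   uses rewritten to q = REALPART_EXPR <t> and r = IMAGPART_EXPR <t> as
   described above. */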
4018 static bool 3535 static bool
4019 convert_to_divmod (gassign *stmt) 3536 convert_to_divmod (gassign *stmt)
4020 { 3537 {
4021 if (stmt_can_throw_internal (stmt) 3538 if (stmt_can_throw_internal (cfun, stmt)
4022 || !divmod_candidate_p (stmt)) 3539 || !divmod_candidate_p (stmt))
4023 return false; 3540 return false;
4024 3541
4025 tree op1 = gimple_assign_rhs1 (stmt); 3542 tree op1 = gimple_assign_rhs1 (stmt);
4026 tree op2 = gimple_assign_rhs2 (stmt); 3543 tree op2 = gimple_assign_rhs2 (stmt);
4042 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR 3559 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
4043 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR) 3560 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4044 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0) 3561 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
4045 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0)) 3562 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
4046 { 3563 {
4047 if (stmt_can_throw_internal (use_stmt)) 3564 if (stmt_can_throw_internal (cfun, use_stmt))
4048 continue; 3565 continue;
4049 3566
4050 basic_block bb = gimple_bb (use_stmt); 3567 basic_block bb = gimple_bb (use_stmt);
4051 3568
4052 if (bb == top_bb) 3569 if (bb == top_bb)
4080 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR) 3597 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4081 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0) 3598 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
4082 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0)) 3599 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
4083 { 3600 {
4084 if (use_stmt == top_stmt 3601 if (use_stmt == top_stmt
4085 || stmt_can_throw_internal (use_stmt) 3602 || stmt_can_throw_internal (cfun, use_stmt)
4086 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb)) 3603 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
4087 continue; 3604 continue;
4088 3605
4089 stmts.safe_push (use_stmt); 3606 stmts.safe_push (use_stmt);
4090 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR) 3607 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
4150 const pass_data pass_data_optimize_widening_mul = 3667 const pass_data pass_data_optimize_widening_mul =
4151 { 3668 {
4152 GIMPLE_PASS, /* type */ 3669 GIMPLE_PASS, /* type */
4153 "widening_mul", /* name */ 3670 "widening_mul", /* name */
4154 OPTGROUP_NONE, /* optinfo_flags */ 3671 OPTGROUP_NONE, /* optinfo_flags */
4155 TV_NONE, /* tv_id */ 3672 TV_TREE_WIDEN_MUL, /* tv_id */
4156 PROP_ssa, /* properties_required */ 3673 PROP_ssa, /* properties_required */
4157 0, /* properties_provided */ 3674 0, /* properties_provided */
4158 0, /* properties_destroyed */ 3675 0, /* properties_destroyed */
4159 0, /* todo_flags_start */ 3676 0, /* todo_flags_start */
4160 TODO_update_ssa, /* todo_flags_finish */ 3677 TODO_update_ssa, /* todo_flags_finish */
4175 3692
4176 virtual unsigned int execute (function *); 3693 virtual unsigned int execute (function *);
4177 3694
4178 }; // class pass_optimize_widening_mul 3695 }; // class pass_optimize_widening_mul
4179 3696
4180 unsigned int 3697 /* Walker class to perform the transformation in reverse dominance order. */
4181 pass_optimize_widening_mul::execute (function *fun) 3698
4182 { 3699 class math_opts_dom_walker : public dom_walker
4183 basic_block bb; 3700 {
4184 bool cfg_changed = false; 3701 public:
4185 3702 /* Constructor, CFG_CHANGED_P is a pointer to a boolean flag that will be set
4186 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats)); 3703 if walking modifies the CFG. */
4187 calculate_dominance_info (CDI_DOMINATORS); 3704
4188 renumber_gimple_stmt_uids (); 3705 math_opts_dom_walker (bool *cfg_changed_p)
4189 3706 : dom_walker (CDI_DOMINATORS), m_last_result_set (),
4190 FOR_EACH_BB_FN (bb, fun) 3707 m_cfg_changed_p (cfg_changed_p) {}
4191 { 3708
4192 gimple_stmt_iterator gsi; 3709 /* The actual actions performed in the walk. */
4193 3710
4194 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) 3711 virtual void after_dom_children (basic_block);
4195 { 3712
4196 gimple *stmt = gsi_stmt (gsi); 3713 /* Set of results of chains of multiply and add statement combinations that
4197 enum tree_code code; 3714 were not transformed into FMAs because of active deferring. */
4198 3715 hash_set<tree> m_last_result_set;
4199 if (is_gimple_assign (stmt)) 3716
3717 /* Pointer to a caller-provided flag that needs to be set if the CFG
3718 has been modified. */
3719 bool *m_cfg_changed_p;
3720 };
3721
3722 void
3723 math_opts_dom_walker::after_dom_children (basic_block bb)
3724 {
3725 gimple_stmt_iterator gsi;
3726
3727 fma_deferring_state fma_state (PARAM_VALUE (PARAM_AVOID_FMA_MAX_BITS) > 0);
3728
3729 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3730 {
3731 gimple *stmt = gsi_stmt (gsi);
3732 enum tree_code code;
3733
3734 if (is_gimple_assign (stmt))
3735 {
3736 code = gimple_assign_rhs_code (stmt);
3737 switch (code)
4200 { 3738 {
4201 code = gimple_assign_rhs_code (stmt); 3739 case MULT_EXPR:
4202 switch (code) 3740 if (!convert_mult_to_widen (stmt, &gsi)
3741 && !convert_expand_mult_copysign (stmt, &gsi)
3742 && convert_mult_to_fma (stmt,
3743 gimple_assign_rhs1 (stmt),
3744 gimple_assign_rhs2 (stmt),
3745 &fma_state))
4203 { 3746 {
4204 case MULT_EXPR: 3747 gsi_remove (&gsi, true);
4205 if (!convert_mult_to_widen (stmt, &gsi) 3748 release_defs (stmt);
4206 && !convert_expand_mult_copysign (stmt, &gsi) 3749 continue;
3750 }
3751 break;
3752
3753 case PLUS_EXPR:
3754 case MINUS_EXPR:
3755 if (!convert_plusminus_to_widen (&gsi, stmt, code))
3756 match_uaddsub_overflow (&gsi, stmt, code);
3757 break;
3758
3759 case TRUNC_MOD_EXPR:
3760 convert_to_divmod (as_a<gassign *> (stmt));
3761 break;
3762
3763 default:;
3764 }
3765 }
3766 else if (is_gimple_call (stmt))
3767 {
3768 tree fndecl = gimple_call_fndecl (stmt);
3769 if (fndecl && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
3770 {
3771 switch (DECL_FUNCTION_CODE (fndecl))
3772 {
3773 case BUILT_IN_POWF:
3774 case BUILT_IN_POW:
3775 case BUILT_IN_POWL:
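/* pow (x, 2.0) is equivalent to x * x, so it can seed an FMA too:
   convert_mult_to_fma is called with the first argument duplicated,
   e.g. pow (a, 2.0) + c can become .FMA (a, a, c). */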
3776 if (gimple_call_lhs (stmt)
3777 && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3778 && real_equal
3779 (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3780 &dconst2)
4207 && convert_mult_to_fma (stmt, 3781 && convert_mult_to_fma (stmt,
4208 gimple_assign_rhs1 (stmt), 3782 gimple_call_arg (stmt, 0),
4209 gimple_assign_rhs2 (stmt))) 3783 gimple_call_arg (stmt, 0),
3784 &fma_state))
4210 { 3785 {
4211 gsi_remove (&gsi, true); 3786 unlink_stmt_vdef (stmt);
3787 if (gsi_remove (&gsi, true)
3788 && gimple_purge_dead_eh_edges (bb))
3789 *m_cfg_changed_p = true;
4212 release_defs (stmt); 3790 release_defs (stmt);
4213 continue; 3791 continue;
4214 } 3792 }
4215 break; 3793 break;
4216 3794
4217 case PLUS_EXPR:
4218 case MINUS_EXPR:
4219 if (!convert_plusminus_to_widen (&gsi, stmt, code))
4220 match_uaddsub_overflow (&gsi, stmt, code);
4221 break;
4222
4223 case TRUNC_MOD_EXPR:
4224 convert_to_divmod (as_a<gassign *> (stmt));
4225 break;
4226
4227 default:; 3795 default:;
4228 } 3796 }
4229 } 3797 }
4230 else if (is_gimple_call (stmt) 3798 else
4231 && gimple_call_lhs (stmt)) 3799 cancel_fma_deferring (&fma_state);
4232 { 3800 }
4233 tree fndecl = gimple_call_fndecl (stmt); 3801 gsi_next (&gsi);
4234 if (fndecl 3802 }
4235 && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)) 3803 if (fma_state.m_deferring_p
4236 { 3804 && fma_state.m_initial_phi)
4237 switch (DECL_FUNCTION_CODE (fndecl)) 3805 {
4238 { 3806 gcc_checking_assert (fma_state.m_last_result);
4239 case BUILT_IN_POWF: 3807 if (!last_fma_candidate_feeds_initial_phi (&fma_state,
4240 case BUILT_IN_POW: 3808 &m_last_result_set))
4241 case BUILT_IN_POWL: 3809 cancel_fma_deferring (&fma_state);
4242 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST 3810 else
4243 && real_equal 3811 m_last_result_set.add (fma_state.m_last_result);
4244 (&TREE_REAL_CST (gimple_call_arg (stmt, 1)), 3812 }
4245 &dconst2) 3813 }
4246 && convert_mult_to_fma (stmt, 3814
4247 gimple_call_arg (stmt, 0), 3815
4248 gimple_call_arg (stmt, 0))) 3816 unsigned int
4249 { 3817 pass_optimize_widening_mul::execute (function *fun)
4250 unlink_stmt_vdef (stmt); 3818 {
4251 if (gsi_remove (&gsi, true) 3819 bool cfg_changed = false;
4252 && gimple_purge_dead_eh_edges (bb)) 3820
4253 cfg_changed = true; 3821 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
4254 release_defs (stmt); 3822 calculate_dominance_info (CDI_DOMINATORS);
4255 continue; 3823 renumber_gimple_stmt_uids ();
4256 } 3824
4257 break; 3825 math_opts_dom_walker (&cfg_changed).walk (ENTRY_BLOCK_PTR_FOR_FN (cfun));
4258
4259 default:;
4260 }
4261 }
4262 }
4263 gsi_next (&gsi);
4264 }
4265 }
4266 3826
4267 statistics_counter_event (fun, "widening multiplications inserted", 3827 statistics_counter_event (fun, "widening multiplications inserted",
4268 widen_mul_stats.widen_mults_inserted); 3828 widen_mul_stats.widen_mults_inserted);
4269 statistics_counter_event (fun, "widening maccs inserted", 3829 statistics_counter_event (fun, "widening maccs inserted",
4270 widen_mul_stats.maccs_inserted); 3830 widen_mul_stats.maccs_inserted);