Mercurial > hg > CbC > CbC_gcc
diff gcc/shrink-wrap.c @ 111:04ced10e8804
gcc 7
author | kono |
---|---|
date | Fri, 27 Oct 2017 22:46:09 +0900 |
parents | |
children | 84e7813d76e9 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gcc/shrink-wrap.c Fri Oct 27 22:46:09 2017 +0900 @@ -0,0 +1,1854 @@ +/* Shrink-wrapping related optimizations. + Copyright (C) 1987-2017 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +/* This file handles shrink-wrapping related optimizations. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "target.h" +#include "rtl.h" +#include "tree.h" +#include "cfghooks.h" +#include "df.h" +#include "memmodel.h" +#include "tm_p.h" +#include "regs.h" +#include "insn-config.h" +#include "emit-rtl.h" +#include "output.h" +#include "tree-pass.h" +#include "cfgrtl.h" +#include "cfgbuild.h" +#include "params.h" +#include "bb-reorder.h" +#include "shrink-wrap.h" +#include "regcprop.h" +#include "rtl-iter.h" +#include "valtrack.h" + + +/* Return true if INSN requires the stack frame to be set up. + PROLOGUE_USED contains the hard registers used in the function + prologue. SET_UP_BY_PROLOGUE is the set of registers we expect the + prologue to set up for the function. */ +bool +requires_stack_frame_p (rtx_insn *insn, HARD_REG_SET prologue_used, + HARD_REG_SET set_up_by_prologue) +{ + df_ref def, use; + HARD_REG_SET hardregs; + unsigned regno; + + if (CALL_P (insn)) + return !SIBLING_CALL_P (insn); + + /* We need a frame to get the unique CFA expected by the unwinder. */ + if (cfun->can_throw_non_call_exceptions && can_throw_internal (insn)) + return true; + + CLEAR_HARD_REG_SET (hardregs); + FOR_EACH_INSN_DEF (def, insn) + { + rtx dreg = DF_REF_REG (def); + + if (!REG_P (dreg)) + continue; + + add_to_hard_reg_set (&hardregs, GET_MODE (dreg), REGNO (dreg)); + } + if (hard_reg_set_intersect_p (hardregs, prologue_used)) + return true; + AND_COMPL_HARD_REG_SET (hardregs, call_used_reg_set); + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (TEST_HARD_REG_BIT (hardregs, regno) + && df_regs_ever_live_p (regno)) + return true; + + FOR_EACH_INSN_USE (use, insn) + { + rtx reg = DF_REF_REG (use); + + if (!REG_P (reg)) + continue; + + add_to_hard_reg_set (&hardregs, GET_MODE (reg), + REGNO (reg)); + } + if (hard_reg_set_intersect_p (hardregs, set_up_by_prologue)) + return true; + + return false; +} + +/* See whether there has a single live edge from BB, which dest uses + [REGNO, END_REGNO). Return the live edge if its dest bb has + one or two predecessors. Otherwise return NULL. */ + +static edge +live_edge_for_reg (basic_block bb, int regno, int end_regno) +{ + edge e, live_edge; + edge_iterator ei; + bitmap live; + int i; + + live_edge = NULL; + FOR_EACH_EDGE (e, ei, bb->succs) + { + live = df_get_live_in (e->dest); + for (i = regno; i < end_regno; i++) + if (REGNO_REG_SET_P (live, i)) + { + if (live_edge && live_edge != e) + return NULL; + live_edge = e; + } + } + + /* We can sometimes encounter dead code. Don't try to move it + into the exit block. */ + if (!live_edge || live_edge->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)) + return NULL; + + /* Reject targets of abnormal edges. This is needed for correctness + on ports like Alpha and MIPS, whose pic_offset_table_rtx can die on + exception edges even though it is generally treated as call-saved + for the majority of the compilation. Moving across abnormal edges + isn't going to be interesting for shrink-wrap usage anyway. */ + if (live_edge->flags & EDGE_ABNORMAL) + return NULL; + + /* When live_edge->dest->preds == 2, we can create a new block on + the edge to make it meet the requirement. */ + if (EDGE_COUNT (live_edge->dest->preds) > 2) + return NULL; + + return live_edge; +} + +/* Try to move INSN from BB to a successor. Return true on success. + USES and DEFS are the set of registers that are used and defined + after INSN in BB. SPLIT_P indicates whether a live edge from BB + is splitted or not. */ + +static bool +move_insn_for_shrink_wrap (basic_block bb, rtx_insn *insn, + const HARD_REG_SET uses, + const HARD_REG_SET defs, + bool *split_p, + struct dead_debug_local *debug) +{ + rtx set, src, dest; + bitmap live_out, live_in, bb_uses, bb_defs; + unsigned int i, dregno, end_dregno; + unsigned int sregno = FIRST_PSEUDO_REGISTER; + unsigned int end_sregno = FIRST_PSEUDO_REGISTER; + basic_block next_block; + edge live_edge; + rtx_insn *dinsn; + df_ref def; + + /* Look for a simple register assignment. We don't use single_set here + because we can't deal with any CLOBBERs, USEs, or REG_UNUSED secondary + destinations. */ + if (!INSN_P (insn)) + return false; + set = PATTERN (insn); + if (GET_CODE (set) != SET) + return false; + src = SET_SRC (set); + dest = SET_DEST (set); + + /* For the destination, we want only a register. Also disallow STACK + or FRAME related adjustments. They are likely part of the prologue, + so keep them in the entry block. */ + if (!REG_P (dest) + || dest == stack_pointer_rtx + || dest == frame_pointer_rtx + || dest == hard_frame_pointer_rtx) + return false; + + /* For the source, we want one of: + (1) A (non-overlapping) register + (2) A constant, + (3) An expression involving no more than one register. + + That last point comes from the code following, which was originally + written to handle only register move operations, and still only handles + a single source register when checking for overlaps. Happily, the + same checks can be applied to expressions like (plus reg const). */ + + if (CONSTANT_P (src)) + ; + else if (!REG_P (src)) + { + rtx src_inner = NULL_RTX; + + if (can_throw_internal (insn)) + return false; + + subrtx_var_iterator::array_type array; + FOR_EACH_SUBRTX_VAR (iter, array, src, ALL) + { + rtx x = *iter; + switch (GET_RTX_CLASS (GET_CODE (x))) + { + case RTX_CONST_OBJ: + case RTX_COMPARE: + case RTX_COMM_COMPARE: + case RTX_BIN_ARITH: + case RTX_COMM_ARITH: + case RTX_UNARY: + case RTX_TERNARY: + /* Constant or expression. Continue. */ + break; + + case RTX_OBJ: + case RTX_EXTRA: + switch (GET_CODE (x)) + { + case UNSPEC: + case SUBREG: + case STRICT_LOW_PART: + case PC: + case LO_SUM: + /* Ok. Continue. */ + break; + + case REG: + /* Fail if we see a second inner register. */ + if (src_inner != NULL) + return false; + src_inner = x; + break; + + default: + return false; + } + break; + + default: + return false; + } + } + + if (src_inner != NULL) + src = src_inner; + } + + /* Make sure that the source register isn't defined later in BB. */ + if (REG_P (src)) + { + sregno = REGNO (src); + end_sregno = END_REGNO (src); + if (overlaps_hard_reg_set_p (defs, GET_MODE (src), sregno)) + return false; + } + + /* Make sure that the destination register isn't referenced later in BB. */ + dregno = REGNO (dest); + end_dregno = END_REGNO (dest); + if (overlaps_hard_reg_set_p (uses, GET_MODE (dest), dregno) + || overlaps_hard_reg_set_p (defs, GET_MODE (dest), dregno)) + return false; + + /* See whether there is a successor block to which we could move INSN. */ + live_edge = live_edge_for_reg (bb, dregno, end_dregno); + if (!live_edge) + return false; + + next_block = live_edge->dest; + /* Create a new basic block on the edge. */ + if (EDGE_COUNT (next_block->preds) == 2) + { + /* split_edge for a block with only one successor is meaningless. */ + if (EDGE_COUNT (bb->succs) == 1) + return false; + + /* If DF_LIVE doesn't exist, i.e. at -O1, just give up. */ + if (!df_live) + return false; + + basic_block old_dest = live_edge->dest; + next_block = split_edge (live_edge); + + /* We create a new basic block. Call df_grow_bb_info to make sure + all data structures are allocated. */ + df_grow_bb_info (df_live); + + bitmap_and (df_get_live_in (next_block), df_get_live_out (bb), + df_get_live_in (old_dest)); + df_set_bb_dirty (next_block); + + /* We should not split more than once for a function. */ + if (*split_p) + return false; + + *split_p = true; + } + + /* At this point we are committed to moving INSN, but let's try to + move it as far as we can. */ + do + { + if (MAY_HAVE_DEBUG_INSNS) + { + FOR_BB_INSNS_REVERSE (bb, dinsn) + if (DEBUG_INSN_P (dinsn)) + { + df_ref use; + FOR_EACH_INSN_USE (use, dinsn) + if (refers_to_regno_p (dregno, end_dregno, + DF_REF_REG (use), (rtx *) NULL)) + dead_debug_add (debug, use, DF_REF_REGNO (use)); + } + else if (dinsn == insn) + break; + } + live_out = df_get_live_out (bb); + live_in = df_get_live_in (next_block); + bb = next_block; + + /* Check whether BB uses DEST or clobbers DEST. We need to add + INSN to BB if so. Either way, DEST is no longer live on entry, + except for any part that overlaps SRC (next loop). */ + bb_uses = &DF_LR_BB_INFO (bb)->use; + bb_defs = &DF_LR_BB_INFO (bb)->def; + if (df_live) + { + for (i = dregno; i < end_dregno; i++) + { + if (*split_p + || REGNO_REG_SET_P (bb_uses, i) + || REGNO_REG_SET_P (bb_defs, i) + || REGNO_REG_SET_P (&DF_LIVE_BB_INFO (bb)->gen, i)) + next_block = NULL; + CLEAR_REGNO_REG_SET (live_out, i); + CLEAR_REGNO_REG_SET (live_in, i); + } + + /* Check whether BB clobbers SRC. We need to add INSN to BB if so. + Either way, SRC is now live on entry. */ + for (i = sregno; i < end_sregno; i++) + { + if (*split_p + || REGNO_REG_SET_P (bb_defs, i) + || REGNO_REG_SET_P (&DF_LIVE_BB_INFO (bb)->gen, i)) + next_block = NULL; + SET_REGNO_REG_SET (live_out, i); + SET_REGNO_REG_SET (live_in, i); + } + } + else + { + /* DF_LR_BB_INFO (bb)->def does not comprise the DF_REF_PARTIAL and + DF_REF_CONDITIONAL defs. So if DF_LIVE doesn't exist, i.e. + at -O1, just give up searching NEXT_BLOCK. */ + next_block = NULL; + for (i = dregno; i < end_dregno; i++) + { + CLEAR_REGNO_REG_SET (live_out, i); + CLEAR_REGNO_REG_SET (live_in, i); + } + + for (i = sregno; i < end_sregno; i++) + { + SET_REGNO_REG_SET (live_out, i); + SET_REGNO_REG_SET (live_in, i); + } + } + + /* If we don't need to add the move to BB, look for a single + successor block. */ + if (next_block) + { + live_edge = live_edge_for_reg (next_block, dregno, end_dregno); + if (!live_edge || EDGE_COUNT (live_edge->dest->preds) > 1) + break; + next_block = live_edge->dest; + } + } + while (next_block); + + /* For the new created basic block, there is no dataflow info at all. + So skip the following dataflow update and check. */ + if (!(*split_p)) + { + /* BB now defines DEST. It only uses the parts of DEST that overlap SRC + (next loop). */ + for (i = dregno; i < end_dregno; i++) + { + CLEAR_REGNO_REG_SET (bb_uses, i); + SET_REGNO_REG_SET (bb_defs, i); + } + + /* BB now uses SRC. */ + for (i = sregno; i < end_sregno; i++) + SET_REGNO_REG_SET (bb_uses, i); + } + + /* Insert debug temps for dead REGs used in subsequent debug insns. */ + if (debug->used && !bitmap_empty_p (debug->used)) + FOR_EACH_INSN_DEF (def, insn) + dead_debug_insert_temp (debug, DF_REF_REGNO (def), insn, + DEBUG_TEMP_BEFORE_WITH_VALUE); + + emit_insn_after (PATTERN (insn), bb_note (bb)); + delete_insn (insn); + return true; +} + +/* Look for register copies in the first block of the function, and move + them down into successor blocks if the register is used only on one + path. This exposes more opportunities for shrink-wrapping. These + kinds of sets often occur when incoming argument registers are moved + to call-saved registers because their values are live across one or + more calls during the function. */ + +static void +prepare_shrink_wrap (basic_block entry_block) +{ + rtx_insn *insn, *curr; + rtx x; + HARD_REG_SET uses, defs; + df_ref def, use; + bool split_p = false; + unsigned int i; + struct dead_debug_local debug; + + if (JUMP_P (BB_END (entry_block))) + { + /* To have more shrink-wrapping opportunities, prepare_shrink_wrap tries + to sink the copies from parameter to callee saved register out of + entry block. copyprop_hardreg_forward_bb_without_debug_insn is called + to release some dependences. */ + copyprop_hardreg_forward_bb_without_debug_insn (entry_block); + } + + dead_debug_local_init (&debug, NULL, NULL); + CLEAR_HARD_REG_SET (uses); + CLEAR_HARD_REG_SET (defs); + + FOR_BB_INSNS_REVERSE_SAFE (entry_block, insn, curr) + if (NONDEBUG_INSN_P (insn) + && !move_insn_for_shrink_wrap (entry_block, insn, uses, defs, + &split_p, &debug)) + { + /* Add all defined registers to DEFs. */ + FOR_EACH_INSN_DEF (def, insn) + { + x = DF_REF_REG (def); + if (REG_P (x) && HARD_REGISTER_P (x)) + for (i = REGNO (x); i < END_REGNO (x); i++) + SET_HARD_REG_BIT (defs, i); + } + + /* Add all used registers to USESs. */ + FOR_EACH_INSN_USE (use, insn) + { + x = DF_REF_REG (use); + if (REG_P (x) && HARD_REGISTER_P (x)) + for (i = REGNO (x); i < END_REGNO (x); i++) + SET_HARD_REG_BIT (uses, i); + } + } + + dead_debug_local_finish (&debug, NULL); +} + +/* Return whether basic block PRO can get the prologue. It can not if it + has incoming complex edges that need a prologue inserted (we make a new + block for the prologue, so those edges would need to be redirected, which + does not work). It also can not if there exist registers live on entry + to PRO that are clobbered by the prologue. */ + +static bool +can_get_prologue (basic_block pro, HARD_REG_SET prologue_clobbered) +{ + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, pro->preds) + if (e->flags & (EDGE_COMPLEX | EDGE_CROSSING) + && !dominated_by_p (CDI_DOMINATORS, e->src, pro)) + return false; + + HARD_REG_SET live; + REG_SET_TO_HARD_REG_SET (live, df_get_live_in (pro)); + if (hard_reg_set_intersect_p (live, prologue_clobbered)) + return false; + + return true; +} + +/* Return whether we can duplicate basic block BB for shrink wrapping. We + cannot if the block cannot be duplicated at all, or if any of its incoming + edges are complex and come from a block that does not require a prologue + (we cannot redirect such edges), or if the block is too big to copy. + PRO is the basic block before which we would put the prologue, MAX_SIZE is + the maximum size block we allow to be copied. */ + +static bool +can_dup_for_shrink_wrapping (basic_block bb, basic_block pro, unsigned max_size) +{ + if (!can_duplicate_block_p (bb)) + return false; + + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->preds) + if (e->flags & (EDGE_COMPLEX | EDGE_CROSSING) + && !dominated_by_p (CDI_DOMINATORS, e->src, pro)) + return false; + + unsigned size = 0; + + rtx_insn *insn; + FOR_BB_INSNS (bb, insn) + if (NONDEBUG_INSN_P (insn)) + { + size += get_attr_min_length (insn); + if (size > max_size) + return false; + } + + return true; +} + +/* Do whatever needs to be done for exits that run without prologue. + Sibcalls need nothing done. Normal exits get a simple_return inserted. */ + +static void +handle_simple_exit (edge e) +{ + + if (e->flags & EDGE_SIBCALL) + { + /* Tell function.c to take no further action on this edge. */ + e->flags |= EDGE_IGNORE; + + e->flags &= ~EDGE_FALLTHRU; + emit_barrier_after_bb (e->src); + return; + } + + /* If the basic block the edge comes from has multiple successors, + split the edge. */ + if (EDGE_COUNT (e->src->succs) > 1) + { + basic_block old_bb = e->src; + rtx_insn *end = BB_END (old_bb); + rtx_note *note = emit_note_after (NOTE_INSN_DELETED, end); + basic_block new_bb = create_basic_block (note, note, old_bb); + BB_COPY_PARTITION (new_bb, old_bb); + BB_END (old_bb) = end; + + redirect_edge_succ (e, new_bb); + new_bb->frequency = EDGE_FREQUENCY (e); + e->flags |= EDGE_FALLTHRU; + + e = make_single_succ_edge (new_bb, EXIT_BLOCK_PTR_FOR_FN (cfun), 0); + } + + e->flags &= ~EDGE_FALLTHRU; + rtx_jump_insn *ret = emit_jump_insn_after (targetm.gen_simple_return (), + BB_END (e->src)); + JUMP_LABEL (ret) = simple_return_rtx; + emit_barrier_after_bb (e->src); + + if (dump_file) + fprintf (dump_file, "Made simple_return with UID %d in bb %d\n", + INSN_UID (ret), e->src->index); +} + +/* Try to perform a kind of shrink-wrapping, making sure the + prologue/epilogue is emitted only around those parts of the + function that require it. + + There will be exactly one prologue, and it will be executed either + zero or one time, on any path. Depending on where the prologue is + placed, some of the basic blocks can be reached via both paths with + and without a prologue. Such blocks will be duplicated here, and the + edges changed to match. + + Paths that go to the exit without going through the prologue will use + a simple_return instead of the epilogue. We maximize the number of + those, making sure to only duplicate blocks that can be duplicated. + If the prologue can then still be placed in multiple locations, we + place it as early as possible. + + An example, where we duplicate blocks with control flow (legend: + _B_egin, _R_eturn and _S_imple_return; edges without arrowhead should + be taken to point down or to the right, to simplify the diagram; here, + block 3 needs a prologue, the rest does not): + + + B B + | | + 2 2 + |\ |\ + | 3 becomes | 3 + |/ | \ + 4 7 4 + |\ |\ |\ + | 5 | 8 | 5 + |/ |/ |/ + 6 9 6 + | | | + R S R + + + (bb 4 is duplicated to 7, and so on; the prologue is inserted on the + edge 2->3). + + Another example, where part of a loop is duplicated (again, bb 3 is + the only block that needs a prologue): + + + B 3<-- B ->3<-- + | | | | | | | + | v | becomes | | v | + 2---4--- 2---5-- 4--- + | | | + R S R + + + (bb 4 is duplicated to 5; the prologue is inserted on the edge 5->3). + + ENTRY_EDGE is the edge where the prologue will be placed, possibly + changed by this function. PROLOGUE_SEQ is the prologue we will insert. */ + +void +try_shrink_wrapping (edge *entry_edge, rtx_insn *prologue_seq) +{ + /* If we cannot shrink-wrap, are told not to shrink-wrap, or it makes + no sense to shrink-wrap: then do not shrink-wrap! */ + + if (!SHRINK_WRAPPING_ENABLED) + return; + + if (crtl->profile && !targetm.profile_before_prologue ()) + return; + + if (crtl->calls_eh_return) + return; + + bool empty_prologue = true; + for (rtx_insn *insn = prologue_seq; insn; insn = NEXT_INSN (insn)) + if (!(NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_PROLOGUE_END)) + { + empty_prologue = false; + break; + } + if (empty_prologue) + return; + + /* Move some code down to expose more shrink-wrapping opportunities. */ + + basic_block entry = (*entry_edge)->dest; + prepare_shrink_wrap (entry); + + if (dump_file) + fprintf (dump_file, "Attempting shrink-wrapping optimization.\n"); + + /* Compute the registers set and used in the prologue. */ + + HARD_REG_SET prologue_clobbered, prologue_used; + CLEAR_HARD_REG_SET (prologue_clobbered); + CLEAR_HARD_REG_SET (prologue_used); + for (rtx_insn *insn = prologue_seq; insn; insn = NEXT_INSN (insn)) + if (NONDEBUG_INSN_P (insn)) + { + HARD_REG_SET this_used; + CLEAR_HARD_REG_SET (this_used); + note_uses (&PATTERN (insn), record_hard_reg_uses, &this_used); + AND_COMPL_HARD_REG_SET (this_used, prologue_clobbered); + IOR_HARD_REG_SET (prologue_used, this_used); + note_stores (PATTERN (insn), record_hard_reg_sets, &prologue_clobbered); + } + CLEAR_HARD_REG_BIT (prologue_clobbered, STACK_POINTER_REGNUM); + if (frame_pointer_needed) + CLEAR_HARD_REG_BIT (prologue_clobbered, HARD_FRAME_POINTER_REGNUM); + + /* Find out what registers are set up by the prologue; any use of these + cannot happen before the prologue. */ + + struct hard_reg_set_container set_up_by_prologue; + CLEAR_HARD_REG_SET (set_up_by_prologue.set); + add_to_hard_reg_set (&set_up_by_prologue.set, Pmode, STACK_POINTER_REGNUM); + add_to_hard_reg_set (&set_up_by_prologue.set, Pmode, ARG_POINTER_REGNUM); + if (frame_pointer_needed) + add_to_hard_reg_set (&set_up_by_prologue.set, Pmode, + HARD_FRAME_POINTER_REGNUM); + if (pic_offset_table_rtx + && (unsigned) PIC_OFFSET_TABLE_REGNUM != INVALID_REGNUM) + add_to_hard_reg_set (&set_up_by_prologue.set, Pmode, + PIC_OFFSET_TABLE_REGNUM); + if (crtl->drap_reg) + add_to_hard_reg_set (&set_up_by_prologue.set, + GET_MODE (crtl->drap_reg), + REGNO (crtl->drap_reg)); + if (targetm.set_up_by_prologue) + targetm.set_up_by_prologue (&set_up_by_prologue); + + /* We will insert the prologue before the basic block PRO. PRO should + dominate all basic blocks that need the prologue to be executed + before them. First, make PRO the "tightest wrap" possible. */ + + calculate_dominance_info (CDI_DOMINATORS); + + basic_block pro = 0; + + basic_block bb; + edge e; + edge_iterator ei; + FOR_EACH_BB_FN (bb, cfun) + { + rtx_insn *insn; + FOR_BB_INSNS (bb, insn) + if (NONDEBUG_INSN_P (insn) + && requires_stack_frame_p (insn, prologue_used, + set_up_by_prologue.set)) + { + if (dump_file) + fprintf (dump_file, "Block %d needs the prologue.\n", bb->index); + pro = nearest_common_dominator (CDI_DOMINATORS, pro, bb); + break; + } + } + + /* If nothing needs a prologue, just put it at the start. This really + shouldn't happen, but we cannot fix it here. */ + + if (pro == 0) + { + if (dump_file) + fprintf(dump_file, "Nothing needs a prologue, but it isn't empty; " + "putting it at the start.\n"); + pro = entry; + } + + if (dump_file) + fprintf (dump_file, "After wrapping required blocks, PRO is now %d\n", + pro->index); + + /* Now see if we can put the prologue at the start of PRO. Putting it + there might require duplicating a block that cannot be duplicated, + or in some cases we cannot insert the prologue there at all. If PRO + wont't do, try again with the immediate dominator of PRO, and so on. + + The blocks that need duplicating are those reachable from PRO but + not dominated by it. We keep in BB_WITH a bitmap of the blocks + reachable from PRO that we already found, and in VEC a stack of + those we still need to consider (to find successors). */ + + auto_bitmap bb_with; + bitmap_set_bit (bb_with, pro->index); + + vec<basic_block> vec; + vec.create (n_basic_blocks_for_fn (cfun)); + vec.quick_push (pro); + + unsigned max_grow_size = get_uncond_jump_length (); + max_grow_size *= PARAM_VALUE (PARAM_MAX_GROW_COPY_BB_INSNS); + + while (!vec.is_empty () && pro != entry) + { + while (pro != entry && !can_get_prologue (pro, prologue_clobbered)) + { + pro = get_immediate_dominator (CDI_DOMINATORS, pro); + + if (bitmap_set_bit (bb_with, pro->index)) + vec.quick_push (pro); + } + + basic_block bb = vec.pop (); + if (!can_dup_for_shrink_wrapping (bb, pro, max_grow_size)) + while (!dominated_by_p (CDI_DOMINATORS, bb, pro)) + { + gcc_assert (pro != entry); + + pro = get_immediate_dominator (CDI_DOMINATORS, pro); + + if (bitmap_set_bit (bb_with, pro->index)) + vec.quick_push (pro); + } + + FOR_EACH_EDGE (e, ei, bb->succs) + if (e->dest != EXIT_BLOCK_PTR_FOR_FN (cfun) + && bitmap_set_bit (bb_with, e->dest->index)) + vec.quick_push (e->dest); + } + + if (dump_file) + fprintf (dump_file, "Avoiding non-duplicatable blocks, PRO is now %d\n", + pro->index); + + /* If we can move PRO back without having to duplicate more blocks, do so. + We do this because putting the prologue earlier is better for scheduling. + + We can move back to a block PRE if every path from PRE will eventually + need a prologue, that is, PRO is a post-dominator of PRE. PRE needs + to dominate every block reachable from itself. We keep in BB_TMP a + bitmap of the blocks reachable from PRE that we already found, and in + VEC a stack of those we still need to consider. + + Any block reachable from PRE is also reachable from all predecessors + of PRE, so if we find we need to move PRE back further we can leave + everything not considered so far on the stack. Any block dominated + by PRE is also dominated by all other dominators of PRE, so anything + found good for some PRE does not need to be reconsidered later. + + We don't need to update BB_WITH because none of the new blocks found + can jump to a block that does not need the prologue. */ + + if (pro != entry) + { + calculate_dominance_info (CDI_POST_DOMINATORS); + + auto_bitmap bb_tmp; + bitmap_copy (bb_tmp, bb_with); + basic_block last_ok = pro; + vec.truncate (0); + + while (pro != entry) + { + basic_block pre = get_immediate_dominator (CDI_DOMINATORS, pro); + if (!dominated_by_p (CDI_POST_DOMINATORS, pre, pro)) + break; + + if (bitmap_set_bit (bb_tmp, pre->index)) + vec.quick_push (pre); + + bool ok = true; + while (!vec.is_empty ()) + { + if (!dominated_by_p (CDI_DOMINATORS, vec.last (), pre)) + { + ok = false; + break; + } + + basic_block bb = vec.pop (); + FOR_EACH_EDGE (e, ei, bb->succs) + if (bitmap_set_bit (bb_tmp, e->dest->index)) + vec.quick_push (e->dest); + } + + if (ok && can_get_prologue (pre, prologue_clobbered)) + last_ok = pre; + + pro = pre; + } + + pro = last_ok; + + free_dominance_info (CDI_POST_DOMINATORS); + } + + vec.release (); + + if (dump_file) + fprintf (dump_file, "Bumping back to anticipatable blocks, PRO is now %d\n", + pro->index); + + if (pro == entry) + { + free_dominance_info (CDI_DOMINATORS); + return; + } + + /* Compute what fraction of the frequency and count of the blocks that run + both with and without prologue are for running with prologue. This gives + the correct answer for reducible flow graphs; for irreducible flow graphs + our profile is messed up beyond repair anyway. */ + + gcov_type num = 0; + gcov_type den = 0; + + FOR_EACH_EDGE (e, ei, pro->preds) + if (!dominated_by_p (CDI_DOMINATORS, e->src, pro)) + { + num += EDGE_FREQUENCY (e); + den += e->src->frequency; + } + + if (den == 0) + den = 1; + + /* All is okay, so do it. */ + + crtl->shrink_wrapped = true; + if (dump_file) + fprintf (dump_file, "Performing shrink-wrapping.\n"); + + /* Copy the blocks that can run both with and without prologue. The + originals run with prologue, the copies without. Store a pointer to + the copy in the ->aux field of the original. */ + + FOR_EACH_BB_FN (bb, cfun) + if (bitmap_bit_p (bb_with, bb->index) + && !dominated_by_p (CDI_DOMINATORS, bb, pro)) + { + basic_block dup = duplicate_block (bb, 0, 0); + + bb->aux = dup; + + if (JUMP_P (BB_END (dup)) && !any_condjump_p (BB_END (dup))) + emit_barrier_after_bb (dup); + + if (EDGE_COUNT (dup->succs) == 0) + emit_barrier_after_bb (dup); + + if (dump_file) + fprintf (dump_file, "Duplicated %d to %d\n", bb->index, dup->index); + + bb->frequency = RDIV (num * bb->frequency, den); + dup->frequency -= bb->frequency; + bb->count = bb->count.apply_scale (num, den); + dup->count -= bb->count; + } + + /* Now change the edges to point to the copies, where appropriate. */ + + FOR_EACH_BB_FN (bb, cfun) + if (!dominated_by_p (CDI_DOMINATORS, bb, pro)) + { + basic_block src = bb; + if (bitmap_bit_p (bb_with, bb->index)) + src = (basic_block) bb->aux; + + FOR_EACH_EDGE (e, ei, src->succs) + { + if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)) + continue; + + if (bitmap_bit_p (bb_with, e->dest->index) + && !dominated_by_p (CDI_DOMINATORS, e->dest, pro)) + { + if (dump_file) + fprintf (dump_file, "Redirecting edge %d->%d to %d\n", + e->src->index, e->dest->index, + ((basic_block) e->dest->aux)->index); + redirect_edge_and_branch_force (e, (basic_block) e->dest->aux); + } + else if (e->flags & EDGE_FALLTHRU + && bitmap_bit_p (bb_with, bb->index)) + force_nonfallthru (e); + } + } + + /* Also redirect the function entry edge if necessary. */ + + FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR_FOR_FN (cfun)->succs) + if (bitmap_bit_p (bb_with, e->dest->index) + && !dominated_by_p (CDI_DOMINATORS, e->dest, pro)) + { + basic_block split_bb = split_edge (e); + e = single_succ_edge (split_bb); + redirect_edge_and_branch_force (e, (basic_block) e->dest->aux); + } + + /* Make a simple_return for those exits that run without prologue. */ + + FOR_EACH_BB_REVERSE_FN (bb, cfun) + if (!bitmap_bit_p (bb_with, bb->index)) + FOR_EACH_EDGE (e, ei, bb->succs) + if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)) + handle_simple_exit (e); + + /* Finally, we want a single edge to put the prologue on. Make a new + block before the PRO block; the edge beteen them is the edge we want. + Then redirect those edges into PRO that come from blocks without the + prologue, to point to the new block instead. The new prologue block + is put at the end of the insn chain. */ + + basic_block new_bb = create_empty_bb (EXIT_BLOCK_PTR_FOR_FN (cfun)->prev_bb); + BB_COPY_PARTITION (new_bb, pro); + new_bb->count = profile_count::zero (); + if (dump_file) + fprintf (dump_file, "Made prologue block %d\n", new_bb->index); + + for (ei = ei_start (pro->preds); (e = ei_safe_edge (ei)); ) + { + if (bitmap_bit_p (bb_with, e->src->index) + || dominated_by_p (CDI_DOMINATORS, e->src, pro)) + { + ei_next (&ei); + continue; + } + + new_bb->count += e->src->count.apply_probability (e->probability); + new_bb->frequency += EDGE_FREQUENCY (e); + + redirect_edge_and_branch_force (e, new_bb); + if (dump_file) + fprintf (dump_file, "Redirected edge from %d\n", e->src->index); + } + + *entry_edge = make_single_succ_edge (new_bb, pro, EDGE_FALLTHRU); + force_nonfallthru (*entry_edge); + + free_dominance_info (CDI_DOMINATORS); +} + +/* Separate shrink-wrapping + + Instead of putting all of the prologue and epilogue in one spot, we + can put parts of it in places where those components are executed less + frequently. The following code does this, for prologue and epilogue + components that can be put in more than one location, and where those + components can be executed more than once (the epilogue component will + always be executed before the prologue component is executed a second + time). + + What exactly is a component is target-dependent. The more usual + components are simple saves/restores to/from the frame of callee-saved + registers. This code treats components abstractly (as an sbitmap), + letting the target handle all details. + + Prologue components are placed in such a way that for every component + the prologue is executed as infrequently as possible. We do this by + walking the dominator tree, comparing the cost of placing a prologue + component before a block to the sum of costs determined for all subtrees + of that block. + + From this placement, we then determine for each component all blocks + where at least one of this block's dominators (including itself) will + get a prologue inserted. That then is how the components are placed. + We could place the epilogue components a bit smarter (we can save a + bit of code size sometimes); this is a possible future improvement. + + Prologues and epilogues are preferably placed into a block, either at + the beginning or end of it, if it is needed for all predecessor resp. + successor edges; or placed on the edge otherwise. + + If the placement of any prologue/epilogue leads to a situation we cannot + handle (for example, an abnormal edge would need to be split, or some + targets want to use some specific registers that may not be available + where we want to put them), separate shrink-wrapping for the components + in that prologue/epilogue is aborted. */ + + +/* Print the sbitmap COMPONENTS to the DUMP_FILE if not empty, with the + label LABEL. */ +static void +dump_components (const char *label, sbitmap components) +{ + if (bitmap_empty_p (components)) + return; + + fprintf (dump_file, " [%s", label); + + for (unsigned int j = 0; j < components->n_bits; j++) + if (bitmap_bit_p (components, j)) + fprintf (dump_file, " %u", j); + + fprintf (dump_file, "]"); +} + +/* The data we collect for each bb. */ +struct sw { + /* What components does this BB need? */ + sbitmap needs_components; + + /* What components does this BB have? This is the main decision this + pass makes. */ + sbitmap has_components; + + /* The components for which we placed code at the start of the BB (instead + of on all incoming edges). */ + sbitmap head_components; + + /* The components for which we placed code at the end of the BB (instead + of on all outgoing edges). */ + sbitmap tail_components; + + /* The frequency of executing the prologue for this BB, if a prologue is + placed on this BB. This is a pessimistic estimate (no prologue is + needed for edges from blocks that have the component under consideration + active already). */ + gcov_type own_cost; + + /* The frequency of executing the prologue for this BB and all BBs + dominated by it. */ + gcov_type total_cost; +}; + +/* A helper function for accessing the pass-specific info. */ +static inline struct sw * +SW (basic_block bb) +{ + gcc_assert (bb->aux); + return (struct sw *) bb->aux; +} + +/* Create the pass-specific data structures for separately shrink-wrapping + with components COMPONENTS. */ +static void +init_separate_shrink_wrap (sbitmap components) +{ + basic_block bb; + FOR_ALL_BB_FN (bb, cfun) + { + bb->aux = xcalloc (1, sizeof (struct sw)); + + SW (bb)->needs_components = targetm.shrink_wrap.components_for_bb (bb); + + /* Mark all basic blocks without successor as needing all components. + This avoids problems in at least cfgcleanup, sel-sched, and + regrename (largely to do with all paths to such a block still + needing the same dwarf CFI info). */ + if (EDGE_COUNT (bb->succs) == 0) + bitmap_copy (SW (bb)->needs_components, components); + + if (dump_file) + { + fprintf (dump_file, "bb %d components:", bb->index); + dump_components ("has", SW (bb)->needs_components); + fprintf (dump_file, "\n"); + } + + SW (bb)->has_components = sbitmap_alloc (SBITMAP_SIZE (components)); + SW (bb)->head_components = sbitmap_alloc (SBITMAP_SIZE (components)); + SW (bb)->tail_components = sbitmap_alloc (SBITMAP_SIZE (components)); + bitmap_clear (SW (bb)->has_components); + } +} + +/* Destroy the pass-specific data. */ +static void +fini_separate_shrink_wrap (void) +{ + basic_block bb; + FOR_ALL_BB_FN (bb, cfun) + if (bb->aux) + { + sbitmap_free (SW (bb)->needs_components); + sbitmap_free (SW (bb)->has_components); + sbitmap_free (SW (bb)->head_components); + sbitmap_free (SW (bb)->tail_components); + free (bb->aux); + bb->aux = 0; + } +} + +/* Place the prologue for component WHICH, in the basic blocks dominated + by HEAD. Do a DFS over the dominator tree, and set bit WHICH in the + HAS_COMPONENTS of a block if either the block has that bit set in + NEEDS_COMPONENTS, or it is cheaper to place the prologue here than in all + dominator subtrees separately. */ +static void +place_prologue_for_one_component (unsigned int which, basic_block head) +{ + /* The block we are currently dealing with. */ + basic_block bb = head; + /* Is this the first time we visit this block, i.e. have we just gone + down the tree. */ + bool first_visit = true; + + /* Walk the dominator tree, visit one block per iteration of this loop. + Each basic block is visited twice: once before visiting any children + of the block, and once after visiting all of them (leaf nodes are + visited only once). As an optimization, we do not visit subtrees + that can no longer influence the prologue placement. */ + for (;;) + { + /* First visit of a block: set the (children) cost accumulator to zero; + if the block does not have the component itself, walk down. */ + if (first_visit) + { + /* Initialize the cost. The cost is the block execution frequency + that does not come from backedges. Calculating this by simply + adding the cost of all edges that aren't backedges does not + work: this does not always add up to the block frequency at + all, and even if it does, rounding error makes for bad + decisions. */ + SW (bb)->own_cost = bb->frequency; + + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->preds) + if (dominated_by_p (CDI_DOMINATORS, e->src, bb)) + { + if (SW (bb)->own_cost > EDGE_FREQUENCY (e)) + SW (bb)->own_cost -= EDGE_FREQUENCY (e); + else + SW (bb)->own_cost = 0; + } + + SW (bb)->total_cost = 0; + + if (!bitmap_bit_p (SW (bb)->needs_components, which) + && first_dom_son (CDI_DOMINATORS, bb)) + { + bb = first_dom_son (CDI_DOMINATORS, bb); + continue; + } + } + + /* If this block does need the component itself, or it is cheaper to + put the prologue here than in all the descendants that need it, + mark it so. If this block's immediate post-dominator is dominated + by this block, and that needs the prologue, we can put it on this + block as well (earlier is better). */ + if (bitmap_bit_p (SW (bb)->needs_components, which) + || SW (bb)->total_cost > SW (bb)->own_cost) + { + SW (bb)->total_cost = SW (bb)->own_cost; + bitmap_set_bit (SW (bb)->has_components, which); + } + else + { + basic_block kid = get_immediate_dominator (CDI_POST_DOMINATORS, bb); + if (dominated_by_p (CDI_DOMINATORS, kid, bb) + && bitmap_bit_p (SW (kid)->has_components, which)) + { + SW (bb)->total_cost = SW (bb)->own_cost; + bitmap_set_bit (SW (bb)->has_components, which); + } + } + + /* We are back where we started, so we are done now. */ + if (bb == head) + return; + + /* We now know the cost of the subtree rooted at the current block. + Accumulate this cost in the parent. */ + basic_block parent = get_immediate_dominator (CDI_DOMINATORS, bb); + SW (parent)->total_cost += SW (bb)->total_cost; + + /* Don't walk the tree down unless necessary. */ + if (next_dom_son (CDI_DOMINATORS, bb) + && SW (parent)->total_cost <= SW (parent)->own_cost) + { + bb = next_dom_son (CDI_DOMINATORS, bb); + first_visit = true; + } + else + { + bb = parent; + first_visit = false; + } + } +} + +/* Set HAS_COMPONENTS in every block to the maximum it can be set to without + setting it on any path from entry to exit where it was not already set + somewhere (or, for blocks that have no path to the exit, consider only + paths from the entry to the block itself). */ +static void +spread_components (sbitmap components) +{ + basic_block entry_block = ENTRY_BLOCK_PTR_FOR_FN (cfun); + basic_block exit_block = EXIT_BLOCK_PTR_FOR_FN (cfun); + + /* A stack of all blocks left to consider, and a bitmap of all blocks + on that stack. */ + vec<basic_block> todo; + todo.create (n_basic_blocks_for_fn (cfun)); + auto_bitmap seen; + + auto_sbitmap old (SBITMAP_SIZE (components)); + + /* Find for every block the components that are *not* needed on some path + from the entry to that block. Do this with a flood fill from the entry + block. Every block can be visited at most as often as the number of + components (plus one), and usually much less often. */ + + if (dump_file) + fprintf (dump_file, "Spreading down...\n"); + + basic_block bb; + FOR_ALL_BB_FN (bb, cfun) + bitmap_clear (SW (bb)->head_components); + + bitmap_copy (SW (entry_block)->head_components, components); + + edge e; + edge_iterator ei; + + todo.quick_push (single_succ (entry_block)); + bitmap_set_bit (seen, single_succ (entry_block)->index); + while (!todo.is_empty ()) + { + bb = todo.pop (); + + bitmap_copy (old, SW (bb)->head_components); + + FOR_EACH_EDGE (e, ei, bb->preds) + bitmap_ior (SW (bb)->head_components, SW (bb)->head_components, + SW (e->src)->head_components); + + bitmap_and_compl (SW (bb)->head_components, SW (bb)->head_components, + SW (bb)->has_components); + + if (!bitmap_equal_p (old, SW (bb)->head_components)) + FOR_EACH_EDGE (e, ei, bb->succs) + if (bitmap_set_bit (seen, e->dest->index)) + todo.quick_push (e->dest); + + bitmap_clear_bit (seen, bb->index); + } + + /* Find for every block the components that are *not* needed on some reverse + path from the exit to that block. */ + + if (dump_file) + fprintf (dump_file, "Spreading up...\n"); + + /* First, mark all blocks not reachable from the exit block as not needing + any component on any path to the exit. Mark everything, and then clear + again by a flood fill. */ + + FOR_ALL_BB_FN (bb, cfun) + bitmap_copy (SW (bb)->tail_components, components); + + FOR_EACH_EDGE (e, ei, exit_block->preds) + { + todo.quick_push (e->src); + bitmap_set_bit (seen, e->src->index); + } + + while (!todo.is_empty ()) + { + bb = todo.pop (); + + if (!bitmap_empty_p (SW (bb)->tail_components)) + FOR_EACH_EDGE (e, ei, bb->preds) + if (bitmap_set_bit (seen, e->src->index)) + todo.quick_push (e->src); + + bitmap_clear (SW (bb)->tail_components); + + bitmap_clear_bit (seen, bb->index); + } + + /* And then, flood fill backwards to find for every block the components + not needed on some path to the exit. */ + + bitmap_copy (SW (exit_block)->tail_components, components); + + FOR_EACH_EDGE (e, ei, exit_block->preds) + { + todo.quick_push (e->src); + bitmap_set_bit (seen, e->src->index); + } + + while (!todo.is_empty ()) + { + bb = todo.pop (); + + bitmap_copy (old, SW (bb)->tail_components); + + FOR_EACH_EDGE (e, ei, bb->succs) + bitmap_ior (SW (bb)->tail_components, SW (bb)->tail_components, + SW (e->dest)->tail_components); + + bitmap_and_compl (SW (bb)->tail_components, SW (bb)->tail_components, + SW (bb)->has_components); + + if (!bitmap_equal_p (old, SW (bb)->tail_components)) + FOR_EACH_EDGE (e, ei, bb->preds) + if (bitmap_set_bit (seen, e->src->index)) + todo.quick_push (e->src); + + bitmap_clear_bit (seen, bb->index); + } + + /* Finally, mark everything not not needed both forwards and backwards. */ + + FOR_EACH_BB_FN (bb, cfun) + { + bitmap_and (SW (bb)->head_components, SW (bb)->head_components, + SW (bb)->tail_components); + bitmap_and_compl (SW (bb)->has_components, components, + SW (bb)->head_components); + } + + FOR_ALL_BB_FN (bb, cfun) + { + if (dump_file) + { + fprintf (dump_file, "bb %d components:", bb->index); + dump_components ("has", SW (bb)->has_components); + fprintf (dump_file, "\n"); + } + } +} + +/* If we cannot handle placing some component's prologues or epilogues where + we decided we should place them, unmark that component in COMPONENTS so + that it is not wrapped separately. */ +static void +disqualify_problematic_components (sbitmap components) +{ + auto_sbitmap pro (SBITMAP_SIZE (components)); + auto_sbitmap epi (SBITMAP_SIZE (components)); + + basic_block bb; + FOR_EACH_BB_FN (bb, cfun) + { + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->succs) + { + /* Find which components we want pro/epilogues for here. */ + bitmap_and_compl (epi, SW (e->src)->has_components, + SW (e->dest)->has_components); + bitmap_and_compl (pro, SW (e->dest)->has_components, + SW (e->src)->has_components); + + /* Ask the target what it thinks about things. */ + if (!bitmap_empty_p (epi)) + targetm.shrink_wrap.disqualify_components (components, e, epi, + false); + if (!bitmap_empty_p (pro)) + targetm.shrink_wrap.disqualify_components (components, e, pro, + true); + + /* If this edge doesn't need splitting, we're fine. */ + if (single_pred_p (e->dest) + && e->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)) + continue; + + /* If the edge can be split, that is fine too. */ + if ((e->flags & EDGE_ABNORMAL) == 0) + continue; + + /* We also can handle sibcalls. */ + if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)) + { + gcc_assert (e->flags & EDGE_SIBCALL); + continue; + } + + /* Remove from consideration those components we would need + pro/epilogues for on edges where we cannot insert them. */ + bitmap_and_compl (components, components, epi); + bitmap_and_compl (components, components, pro); + + if (dump_file && !bitmap_subset_p (epi, components)) + { + fprintf (dump_file, " BAD epi %d->%d", e->src->index, + e->dest->index); + if (e->flags & EDGE_EH) + fprintf (dump_file, " for EH"); + dump_components ("epi", epi); + fprintf (dump_file, "\n"); + } + + if (dump_file && !bitmap_subset_p (pro, components)) + { + fprintf (dump_file, " BAD pro %d->%d", e->src->index, + e->dest->index); + if (e->flags & EDGE_EH) + fprintf (dump_file, " for EH"); + dump_components ("pro", pro); + fprintf (dump_file, "\n"); + } + } + } +} + +/* Place code for prologues and epilogues for COMPONENTS where we can put + that code at the start of basic blocks. */ +static void +emit_common_heads_for_components (sbitmap components) +{ + auto_sbitmap pro (SBITMAP_SIZE (components)); + auto_sbitmap epi (SBITMAP_SIZE (components)); + auto_sbitmap tmp (SBITMAP_SIZE (components)); + + basic_block bb; + FOR_ALL_BB_FN (bb, cfun) + bitmap_clear (SW (bb)->head_components); + + FOR_EACH_BB_FN (bb, cfun) + { + /* Find which prologue resp. epilogue components are needed for all + predecessor edges to this block. */ + + /* First, select all possible components. */ + bitmap_copy (epi, components); + bitmap_copy (pro, components); + + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->preds) + { + if (e->flags & EDGE_ABNORMAL) + { + bitmap_clear (epi); + bitmap_clear (pro); + break; + } + + /* Deselect those epilogue components that should not be inserted + for this edge. */ + bitmap_and_compl (tmp, SW (e->src)->has_components, + SW (e->dest)->has_components); + bitmap_and (epi, epi, tmp); + + /* Similar, for the prologue. */ + bitmap_and_compl (tmp, SW (e->dest)->has_components, + SW (e->src)->has_components); + bitmap_and (pro, pro, tmp); + } + + if (dump_file && !(bitmap_empty_p (epi) && bitmap_empty_p (pro))) + fprintf (dump_file, " bb %d", bb->index); + + if (dump_file && !bitmap_empty_p (epi)) + dump_components ("epi", epi); + if (dump_file && !bitmap_empty_p (pro)) + dump_components ("pro", pro); + + if (dump_file && !(bitmap_empty_p (epi) && bitmap_empty_p (pro))) + fprintf (dump_file, "\n"); + + /* Place code after the BB note. */ + if (!bitmap_empty_p (pro)) + { + start_sequence (); + targetm.shrink_wrap.emit_prologue_components (pro); + rtx_insn *seq = get_insns (); + end_sequence (); + record_prologue_seq (seq); + + emit_insn_after (seq, bb_note (bb)); + + bitmap_ior (SW (bb)->head_components, SW (bb)->head_components, pro); + } + + if (!bitmap_empty_p (epi)) + { + start_sequence (); + targetm.shrink_wrap.emit_epilogue_components (epi); + rtx_insn *seq = get_insns (); + end_sequence (); + record_epilogue_seq (seq); + + emit_insn_after (seq, bb_note (bb)); + + bitmap_ior (SW (bb)->head_components, SW (bb)->head_components, epi); + } + } +} + +/* Place code for prologues and epilogues for COMPONENTS where we can put + that code at the end of basic blocks. */ +static void +emit_common_tails_for_components (sbitmap components) +{ + auto_sbitmap pro (SBITMAP_SIZE (components)); + auto_sbitmap epi (SBITMAP_SIZE (components)); + auto_sbitmap tmp (SBITMAP_SIZE (components)); + + basic_block bb; + FOR_ALL_BB_FN (bb, cfun) + bitmap_clear (SW (bb)->tail_components); + + FOR_EACH_BB_FN (bb, cfun) + { + /* Find which prologue resp. epilogue components are needed for all + successor edges from this block. */ + if (EDGE_COUNT (bb->succs) == 0) + continue; + + /* First, select all possible components. */ + bitmap_copy (epi, components); + bitmap_copy (pro, components); + + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->succs) + { + if (e->flags & EDGE_ABNORMAL) + { + bitmap_clear (epi); + bitmap_clear (pro); + break; + } + + /* Deselect those epilogue components that should not be inserted + for this edge, and also those that are already put at the head + of the successor block. */ + bitmap_and_compl (tmp, SW (e->src)->has_components, + SW (e->dest)->has_components); + bitmap_and_compl (tmp, tmp, SW (e->dest)->head_components); + bitmap_and (epi, epi, tmp); + + /* Similarly, for the prologue. */ + bitmap_and_compl (tmp, SW (e->dest)->has_components, + SW (e->src)->has_components); + bitmap_and_compl (tmp, tmp, SW (e->dest)->head_components); + bitmap_and (pro, pro, tmp); + } + + /* If the last insn of this block is a control flow insn we cannot + put anything after it. We can put our code before it instead, + but only if that jump insn is a simple jump. */ + rtx_insn *last_insn = BB_END (bb); + if (control_flow_insn_p (last_insn) && !simplejump_p (last_insn)) + { + bitmap_clear (epi); + bitmap_clear (pro); + } + + if (dump_file && !(bitmap_empty_p (epi) && bitmap_empty_p (pro))) + fprintf (dump_file, " bb %d", bb->index); + + if (dump_file && !bitmap_empty_p (epi)) + dump_components ("epi", epi); + if (dump_file && !bitmap_empty_p (pro)) + dump_components ("pro", pro); + + if (dump_file && !(bitmap_empty_p (epi) && bitmap_empty_p (pro))) + fprintf (dump_file, "\n"); + + /* Put the code at the end of the BB, but before any final jump. */ + if (!bitmap_empty_p (epi)) + { + start_sequence (); + targetm.shrink_wrap.emit_epilogue_components (epi); + rtx_insn *seq = get_insns (); + end_sequence (); + record_epilogue_seq (seq); + + if (control_flow_insn_p (last_insn)) + emit_insn_before (seq, last_insn); + else + emit_insn_after (seq, last_insn); + + bitmap_ior (SW (bb)->tail_components, SW (bb)->tail_components, epi); + } + + if (!bitmap_empty_p (pro)) + { + start_sequence (); + targetm.shrink_wrap.emit_prologue_components (pro); + rtx_insn *seq = get_insns (); + end_sequence (); + record_prologue_seq (seq); + + if (control_flow_insn_p (last_insn)) + emit_insn_before (seq, last_insn); + else + emit_insn_after (seq, last_insn); + + bitmap_ior (SW (bb)->tail_components, SW (bb)->tail_components, pro); + } + } +} + +/* Place prologues and epilogues for COMPONENTS on edges, if we haven't already + placed them inside blocks directly. */ +static void +insert_prologue_epilogue_for_components (sbitmap components) +{ + auto_sbitmap pro (SBITMAP_SIZE (components)); + auto_sbitmap epi (SBITMAP_SIZE (components)); + + basic_block bb; + FOR_EACH_BB_FN (bb, cfun) + { + if (!bb->aux) + continue; + + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->succs) + { + /* Find which pro/epilogue components are needed on this edge. */ + bitmap_and_compl (epi, SW (e->src)->has_components, + SW (e->dest)->has_components); + bitmap_and_compl (pro, SW (e->dest)->has_components, + SW (e->src)->has_components); + bitmap_and (epi, epi, components); + bitmap_and (pro, pro, components); + + /* Deselect those we already have put at the head or tail of the + edge's dest resp. src. */ + bitmap_and_compl (epi, epi, SW (e->dest)->head_components); + bitmap_and_compl (pro, pro, SW (e->dest)->head_components); + bitmap_and_compl (epi, epi, SW (e->src)->tail_components); + bitmap_and_compl (pro, pro, SW (e->src)->tail_components); + + if (!bitmap_empty_p (epi) || !bitmap_empty_p (pro)) + { + if (dump_file) + { + fprintf (dump_file, " %d->%d", e->src->index, + e->dest->index); + dump_components ("epi", epi); + dump_components ("pro", pro); + if (e->flags & EDGE_SIBCALL) + fprintf (dump_file, " (SIBCALL)"); + else if (e->flags & EDGE_ABNORMAL) + fprintf (dump_file, " (ABNORMAL)"); + fprintf (dump_file, "\n"); + } + + /* Put the epilogue components in place. */ + start_sequence (); + targetm.shrink_wrap.emit_epilogue_components (epi); + rtx_insn *seq = get_insns (); + end_sequence (); + record_epilogue_seq (seq); + + if (e->flags & EDGE_SIBCALL) + { + gcc_assert (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)); + + rtx_insn *insn = BB_END (e->src); + gcc_assert (CALL_P (insn) && SIBLING_CALL_P (insn)); + emit_insn_before (seq, insn); + } + else if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)) + { + gcc_assert (e->flags & EDGE_FALLTHRU); + basic_block new_bb = split_edge (e); + emit_insn_after (seq, BB_END (new_bb)); + } + else + insert_insn_on_edge (seq, e); + + /* Put the prologue components in place. */ + start_sequence (); + targetm.shrink_wrap.emit_prologue_components (pro); + seq = get_insns (); + end_sequence (); + record_prologue_seq (seq); + + insert_insn_on_edge (seq, e); + } + } + } + + commit_edge_insertions (); +} + +/* The main entry point to this subpass. FIRST_BB is where the prologue + would be normally put. */ +void +try_shrink_wrapping_separate (basic_block first_bb) +{ + if (HAVE_cc0) + return; + + if (!(SHRINK_WRAPPING_ENABLED + && flag_shrink_wrap_separate + && optimize_function_for_speed_p (cfun) + && targetm.shrink_wrap.get_separate_components)) + return; + + /* We don't handle "strange" functions. */ + if (cfun->calls_alloca + || cfun->calls_setjmp + || cfun->can_throw_non_call_exceptions + || crtl->calls_eh_return + || crtl->has_nonlocal_goto + || crtl->saves_all_registers) + return; + + /* Ask the target what components there are. If it returns NULL, don't + do anything. */ + sbitmap components = targetm.shrink_wrap.get_separate_components (); + if (!components) + return; + + /* We need LIVE info, not defining anything in the entry block and not + using anything in the exit block. A block then needs a component if + the register for that component is in the IN or GEN or KILL set for + that block. */ + df_scan->local_flags |= DF_SCAN_EMPTY_ENTRY_EXIT; + df_update_entry_exit_and_calls (); + df_live_add_problem (); + df_live_set_all_dirty (); + df_analyze (); + + calculate_dominance_info (CDI_DOMINATORS); + calculate_dominance_info (CDI_POST_DOMINATORS); + + init_separate_shrink_wrap (components); + + sbitmap_iterator sbi; + unsigned int j; + EXECUTE_IF_SET_IN_BITMAP (components, 0, j, sbi) + place_prologue_for_one_component (j, first_bb); + + spread_components (components); + + disqualify_problematic_components (components); + + /* Don't separately shrink-wrap anything where the "main" prologue will + go; the target code can often optimize things if it is presented with + all components together (say, if it generates store-multiple insns). */ + bitmap_and_compl (components, components, SW (first_bb)->has_components); + + if (bitmap_empty_p (components)) + { + if (dump_file) + fprintf (dump_file, "Not wrapping anything separately.\n"); + } + else + { + if (dump_file) + { + fprintf (dump_file, "The components we wrap separately are"); + dump_components ("sep", components); + fprintf (dump_file, "\n"); + + fprintf (dump_file, "... Inserting common heads...\n"); + } + + emit_common_heads_for_components (components); + + if (dump_file) + fprintf (dump_file, "... Inserting common tails...\n"); + + emit_common_tails_for_components (components); + + if (dump_file) + fprintf (dump_file, "... Inserting the more difficult ones...\n"); + + insert_prologue_epilogue_for_components (components); + + if (dump_file) + fprintf (dump_file, "... Done.\n"); + + targetm.shrink_wrap.set_handled_components (components); + + crtl->shrink_wrapped_separate = true; + } + + fini_separate_shrink_wrap (); + + sbitmap_free (components); + free_dominance_info (CDI_DOMINATORS); + free_dominance_info (CDI_POST_DOMINATORS); + + /* All done. */ + df_scan->local_flags &= ~DF_SCAN_EMPTY_ENTRY_EXIT; + df_update_entry_exit_and_calls (); + df_live_set_all_dirty (); + df_analyze (); +}