Mercurial > hg > CbC > CbC_gcc
diff gcc/config/arm/arm.c @ 63:b7f97abdc517 gcc-4.6-20100522
update gcc from gcc-4.5.0 to gcc-4.6
author | ryoma <e075725@ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 24 May 2010 12:47:05 +0900 |
parents | 77e2b8dfacca |
children | f6334be47118 |
line wrap: on
line diff
--- a/gcc/config/arm/arm.c Fri Feb 12 23:41:23 2010 +0900 +++ b/gcc/config/arm/arm.c Mon May 24 12:47:05 2010 +0900 @@ -1,6 +1,6 @@ /* Output routines for GCC for ARM. Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, - 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl) and Martin Simmons (@harleqn.co.uk). @@ -31,7 +31,6 @@ #include "obstack.h" #include "regs.h" #include "hard-reg-set.h" -#include "real.h" #include "insn-config.h" #include "conditions.h" #include "output.h" @@ -151,7 +150,6 @@ static bool arm_cirrus_insn_p (rtx); static void cirrus_reorg (rtx); static void arm_init_builtins (void); -static rtx arm_expand_builtin (tree, rtx, rtx, enum machine_mode, int); static void arm_init_iwmmxt_builtins (void); static rtx safe_vector_operand (rtx, enum machine_mode); static rtx arm_expand_binop_builtin (enum insn_code, tree, rtx); @@ -224,6 +222,7 @@ static void arm_asm_trampoline_template (FILE *); static void arm_trampoline_init (rtx, tree, rtx); static rtx arm_trampoline_adjust_address (rtx); +static rtx arm_pic_static_addr (rtx orig, rtx reg); /* Table of machine attributes. */ @@ -525,6 +524,9 @@ /* The processor for which instructions should be scheduled. */ enum processor_type arm_tune = arm_none; +/* The current tuning set. */ +const struct tune_params *current_tune; + /* The default processor used if not overridden by commandline. */ static enum processor_type arm_default_cpu = arm_none; @@ -601,7 +603,7 @@ #define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2) #define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM) #define FL_FOR_ARCH7 (FL_FOR_ARCH6T2 &~ FL_NOTM) -#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM) +#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K) #define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_DIV) #define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_DIV) #define FL_FOR_ARCH7EM (FL_FOR_ARCH7M | FL_ARCH7EM) @@ -697,9 +699,6 @@ the next function. */ static int after_arm_reorg = 0; -/* The maximum number of insns to be used when loading a constant. */ -static int arm_constant_limit = 3; - static enum arm_pcs arm_pcs_default; /* For an explanation of these variables, see final_prescan_insn below. */ @@ -738,7 +737,31 @@ enum processor_type core; const char *arch; const unsigned long flags; - bool (* rtx_costs) (rtx, enum rtx_code, enum rtx_code, int *, bool); + const struct tune_params *const tune; +}; + +const struct tune_params arm_slowmul_tune = +{ + arm_slowmul_rtx_costs, + 3 +}; + +const struct tune_params arm_fastmul_tune = +{ + arm_fastmul_rtx_costs, + 1 +}; + +const struct tune_params arm_xscale_tune = +{ + arm_xscale_rtx_costs, + 2 +}; + +const struct tune_params arm_9e_tune = +{ + arm_9e_rtx_costs, + 1 }; /* Not all of these give usefully different compilation alternatives, @@ -747,7 +770,7 @@ { /* ARM Cores */ #define ARM_CORE(NAME, IDENT, ARCH, FLAGS, COSTS) \ - {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, arm_##COSTS##_rtx_costs}, + {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, &arm_##COSTS##_tune}, #include "arm-cores.def" #undef ARM_CORE {NULL, arm_none, NULL, 0, NULL} @@ -756,7 +779,7 @@ static const struct processors all_architectures[] = { /* ARM Architectures */ - /* We don't specify rtx_costs here as it will be figured out + /* We don't specify tuning costs here as it will be figured out from the core. */ {"armv2", arm2, "2", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH2, NULL}, @@ -905,6 +928,13 @@ TLS_LE32 }; +/* The maximum number of insns to be used when loading a constant. */ +inline static int +arm_constant_limit (bool size_p) +{ + return size_p ? 1 : current_tune->constant_limit; +} + /* Emit an insn that's a simple single-set. Both the operands must be known to be valid. */ inline static rtx @@ -1445,6 +1475,7 @@ gcc_assert (arm_tune != arm_none); tune_flags = all_cores[(int)arm_tune].flags; + current_tune = all_cores[(int)arm_tune].tune; if (target_fp16_format_name) { @@ -1639,8 +1670,12 @@ break; } } + if (!arm_fpu_desc) - error ("invalid floating point option: -mfpu=%s", target_fpu_name); + { + error ("invalid floating point option: -mfpu=%s", target_fpu_name); + return; + } switch (arm_fpu_desc->model) { @@ -1758,7 +1793,7 @@ /* Use the cp15 method if it is available. */ if (target_thread_pointer == TP_AUTO) { - if (arm_arch6k && !TARGET_THUMB) + if (arm_arch6k && !TARGET_THUMB1) target_thread_pointer = TP_CP15; else target_thread_pointer = TP_SOFT; @@ -1837,26 +1872,12 @@ if (optimize_size) { - arm_constant_limit = 1; - /* If optimizing for size, bump the number of instructions that we are prepared to conditionally execute (even on a StrongARM). */ max_insns_skipped = 6; } else { - /* For processors with load scheduling, it never costs more than - 2 cycles to load a constant, and the load scheduler may well - reduce that to 1. */ - if (arm_ld_sched) - arm_constant_limit = 1; - - /* On XScale the longer latency of a load makes it more difficult - to achieve a good schedule, so it's faster to synthesize - constants that can be done in two insns. */ - if (arm_tune_xscale) - arm_constant_limit = 2; - /* StrongARM has early execution of branches, so a sequence that is worth skipping is shorter. */ if (arm_tune_strongarm) @@ -1873,13 +1894,6 @@ flag_reorder_blocks = 1; } - /* Ideally we would want to use CFI directives to generate - debug info. However this also creates the .eh_frame - section, so disable them until GAS can handle - this properly. See PR40521. */ - if (TARGET_AAPCS_BASED) - flag_dwarf2_cfi_asm = 0; - /* Register global variables with the garbage collector. */ arm_add_gc_roots (); } @@ -2364,7 +2378,8 @@ && !cond && (arm_gen_constant (code, mode, NULL_RTX, val, target, source, 1, 0) - > arm_constant_limit + (code != SET))) + > (arm_constant_limit (optimize_function_for_size_p (cfun)) + + (code != SET)))) { if (code == SET) { @@ -2524,7 +2539,6 @@ int can_negate = 0; int final_invert = 0; int can_negate_initial = 0; - int can_shift = 0; int i; int num_bits_set = 0; int set_sign_bit_copies = 0; @@ -2543,7 +2557,6 @@ { case SET: can_invert = 1; - can_shift = 1; can_negate = 1; break; @@ -4781,8 +4794,8 @@ return false; /* Never tailcall something for which we have no decl, or if we - are in Thumb mode. */ - if (decl == NULL || TARGET_THUMB) + are generating code for Thumb-1. */ + if (decl == NULL || TARGET_THUMB1) return false; /* The PIC register is live on entry to VxWorks PLT entries, so we @@ -4908,31 +4921,16 @@ { rtx pic_ref, address; rtx insn; - int subregs = 0; - - /* If this function doesn't have a pic register, create one now. */ - require_pic_register (); if (reg == 0) { gcc_assert (can_create_pseudo_p ()); reg = gen_reg_rtx (Pmode); - - subregs = 1; - } - - if (subregs) - address = gen_reg_rtx (Pmode); + address = gen_reg_rtx (Pmode); + } else address = reg; - if (TARGET_ARM) - emit_insn (gen_pic_load_addr_arm (address, orig)); - else if (TARGET_THUMB2) - emit_insn (gen_pic_load_addr_thumb2 (address, orig)); - else /* TARGET_THUMB1 */ - emit_insn (gen_pic_load_addr_thumb1 (address, orig)); - /* VxWorks does not impose a fixed gap between segments; the run-time gap can be different from the object-file gap. We therefore can't use GOTOFF unless we are absolutely sure that the symbol is in the @@ -4944,15 +4942,22 @@ SYMBOL_REF_LOCAL_P (orig))) && NEED_GOT_RELOC && !TARGET_VXWORKS_RTP) - pic_ref = gen_rtx_PLUS (Pmode, cfun->machine->pic_reg, address); - else - { + insn = arm_pic_static_addr (orig, reg); + else + { + /* If this function doesn't have a pic register, create one now. */ + require_pic_register (); + + if (TARGET_32BIT) + emit_insn (gen_pic_load_addr_32bit (address, orig)); + else /* TARGET_THUMB1 */ + emit_insn (gen_pic_load_addr_thumb1 (address, orig)); + pic_ref = gen_const_mem (Pmode, gen_rtx_PLUS (Pmode, cfun->machine->pic_reg, address)); - } - - insn = emit_move_insn (reg, pic_ref); + insn = emit_move_insn (reg, pic_ref); + } /* Put a REG_EQUAL note on this insn, so that it can be optimized by loop. */ @@ -5106,7 +5111,7 @@ { pic_rtx = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE); pic_rtx = gen_rtx_CONST (Pmode, pic_rtx); - emit_insn (gen_pic_load_addr_arm (pic_reg, pic_rtx)); + emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx)); emit_insn (gen_rtx_SET (Pmode, pic_reg, gen_rtx_MEM (Pmode, pic_reg))); @@ -5129,29 +5134,13 @@ UNSPEC_GOTSYM_OFF); pic_rtx = gen_rtx_CONST (Pmode, pic_rtx); - if (TARGET_ARM) - { - emit_insn (gen_pic_load_addr_arm (pic_reg, pic_rtx)); - emit_insn (gen_pic_add_dot_plus_eight (pic_reg, pic_reg, labelno)); - } - else if (TARGET_THUMB2) - { - /* Thumb-2 only allows very limited access to the PC. Calculate the - address in a temporary register. */ - if (arm_pic_register != INVALID_REGNUM) - { - pic_tmp = gen_rtx_REG (SImode, - thumb_find_work_register (saved_regs)); - } + if (TARGET_32BIT) + { + emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx)); + if (TARGET_ARM) + emit_insn (gen_pic_add_dot_plus_eight (pic_reg, pic_reg, labelno)); else - { - gcc_assert (can_create_pseudo_p ()); - pic_tmp = gen_reg_rtx (Pmode); - } - - emit_insn (gen_pic_load_addr_thumb2 (pic_reg, pic_rtx)); - emit_insn (gen_pic_load_dot_plus_four (pic_tmp, labelno)); - emit_insn (gen_addsi3 (pic_reg, pic_reg, pic_tmp)); + emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno)); } else /* TARGET_THUMB1 */ { @@ -5176,6 +5165,43 @@ emit_use (pic_reg); } +/* Generate code to load the address of a static var when flag_pic is set. */ +static rtx +arm_pic_static_addr (rtx orig, rtx reg) +{ + rtx l1, labelno, offset_rtx, insn; + + gcc_assert (flag_pic); + + /* We use an UNSPEC rather than a LABEL_REF because this label + never appears in the code stream. */ + labelno = GEN_INT (pic_labelno++); + l1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); + l1 = gen_rtx_CONST (VOIDmode, l1); + + /* On the ARM the PC register contains 'dot + 8' at the time of the + addition, on the Thumb it is 'dot + 4'. */ + offset_rtx = plus_constant (l1, TARGET_ARM ? 8 : 4); + offset_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, orig, offset_rtx), + UNSPEC_SYMBOL_OFFSET); + offset_rtx = gen_rtx_CONST (Pmode, offset_rtx); + + if (TARGET_32BIT) + { + emit_insn (gen_pic_load_addr_32bit (reg, offset_rtx)); + if (TARGET_ARM) + insn = emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno)); + else + insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); + } + else /* TARGET_THUMB1 */ + { + emit_insn (gen_pic_load_addr_thumb1 (reg, offset_rtx)); + insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); + } + + return insn; +} /* Return nonzero if X is valid as an ARM state addressing register. */ static int @@ -5808,14 +5834,7 @@ if (TARGET_ARM) emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno)); else if (TARGET_THUMB2) - { - rtx tmp; - /* Thumb-2 only allows very limited access to the PC. Calculate - the address in a temporary register. */ - tmp = gen_reg_rtx (SImode); - emit_insn (gen_pic_load_dot_plus_four (tmp, labelno)); - emit_insn (gen_addsi3(reg, reg, tmp)); - } + emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); else /* TARGET_THUMB1 */ emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); @@ -5871,15 +5890,7 @@ if (TARGET_ARM) emit_insn (gen_tls_load_dot_plus_eight (reg, reg, labelno)); else if (TARGET_THUMB2) - { - rtx tmp; - /* Thumb-2 only allows very limited access to the PC. Calculate - the address in a temporary register. */ - tmp = gen_reg_rtx (SImode); - emit_insn (gen_pic_load_dot_plus_four (tmp, labelno)); - emit_insn (gen_addsi3(reg, reg, tmp)); - emit_move_insn (reg, gen_const_mem (SImode, reg)); - } + emit_insn (gen_tls_load_dot_plus_four (reg, NULL, reg, labelno)); else { emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); @@ -6264,6 +6275,15 @@ else if ((outer == IOR || outer == XOR || outer == AND) && INTVAL (x) < 256 && INTVAL (x) >= -256) return COSTS_N_INSNS (1); + else if (outer == AND) + { + int i; + /* This duplicates the tests in the andsi3 expander. */ + for (i = 9; i <= 31; i++) + if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x) + || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x)) + return COSTS_N_INSNS (2); + } else if (outer == ASHIFT || outer == ASHIFTRT || outer == LSHIFTRT) return 0; @@ -6335,7 +6355,6 @@ enum rtx_code subcode; rtx operand; enum rtx_code code = GET_CODE (x); - int extra_cost; *total = 0; switch (code) @@ -6559,7 +6578,6 @@ /* Fall through */ case AND: case XOR: case IOR: - extra_cost = 0; /* Normally the frame registers will be spilt into reg+const during reload, so it is a bad idea to combine them with other instructions, @@ -6911,6 +6929,130 @@ } } +/* Estimates the size cost of thumb1 instructions. + For now most of the code is copied from thumb1_rtx_costs. We need more + fine grain tuning when we have more related test cases. */ +static inline int +thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) +{ + enum machine_mode mode = GET_MODE (x); + + switch (code) + { + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + case ROTATERT: + case PLUS: + case MINUS: + case COMPARE: + case NEG: + case NOT: + return COSTS_N_INSNS (1); + + case MULT: + if (GET_CODE (XEXP (x, 1)) == CONST_INT) + { + /* Thumb1 mul instruction can't operate on const. We must Load it + into a register first. */ + int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET); + return COSTS_N_INSNS (1) + const_size; + } + return COSTS_N_INSNS (1); + + case SET: + return (COSTS_N_INSNS (1) + + 4 * ((GET_CODE (SET_SRC (x)) == MEM) + + GET_CODE (SET_DEST (x)) == MEM)); + + case CONST_INT: + if (outer == SET) + { + if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256) + return 0; + if (thumb_shiftable_const (INTVAL (x))) + return COSTS_N_INSNS (2); + return COSTS_N_INSNS (3); + } + else if ((outer == PLUS || outer == COMPARE) + && INTVAL (x) < 256 && INTVAL (x) > -256) + return 0; + else if ((outer == IOR || outer == XOR || outer == AND) + && INTVAL (x) < 256 && INTVAL (x) >= -256) + return COSTS_N_INSNS (1); + else if (outer == AND) + { + int i; + /* This duplicates the tests in the andsi3 expander. */ + for (i = 9; i <= 31; i++) + if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x) + || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x)) + return COSTS_N_INSNS (2); + } + else if (outer == ASHIFT || outer == ASHIFTRT + || outer == LSHIFTRT) + return 0; + return COSTS_N_INSNS (2); + + case CONST: + case CONST_DOUBLE: + case LABEL_REF: + case SYMBOL_REF: + return COSTS_N_INSNS (3); + + case UDIV: + case UMOD: + case DIV: + case MOD: + return 100; + + case TRUNCATE: + return 99; + + case AND: + case XOR: + case IOR: + /* XXX guess. */ + return 8; + + case MEM: + /* XXX another guess. */ + /* Memory costs quite a lot for the first word, but subsequent words + load at the equivalent of a single insn each. */ + return (10 + 4 * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD) + + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)) + ? 4 : 0)); + + case IF_THEN_ELSE: + /* XXX a guess. */ + if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC) + return 14; + return 2; + + case ZERO_EXTEND: + /* XXX still guessing. */ + switch (GET_MODE (XEXP (x, 0))) + { + case QImode: + return (1 + (mode == DImode ? 4 : 0) + + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); + + case HImode: + return (4 + (mode == DImode ? 4 : 0) + + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); + + case SImode: + return (1 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); + + default: + return 99; + } + + default: + return 99; + } +} + /* RTX costs when optimizing for size. */ static bool arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, @@ -6919,8 +7061,7 @@ enum machine_mode mode = GET_MODE (x); if (TARGET_THUMB1) { - /* XXX TBD. For now, use the standard costs. */ - *total = thumb1_rtx_costs (x, code, outer_code); + *total = thumb1_size_rtx_costs (x, code, outer_code); return true; } @@ -7170,9 +7311,9 @@ return arm_size_rtx_costs (x, (enum rtx_code) code, (enum rtx_code) outer_code, total); else - return all_cores[(int)arm_tune].rtx_costs (x, (enum rtx_code) code, - (enum rtx_code) outer_code, - total, speed); + return current_tune->rtx_costs (x, (enum rtx_code) code, + (enum rtx_code) outer_code, + total, speed); } /* RTX costs for cores with a slow MUL implementation. Thumb-2 is not @@ -7317,7 +7458,8 @@ so it can be ignored. */ static bool -arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, int *total, bool speed) +arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, + int *total, bool speed) { enum machine_mode mode = GET_MODE (x); @@ -7657,7 +7799,7 @@ init_fp_table (); REAL_VALUE_FROM_CONST_DOUBLE (r, x); - r = REAL_VALUE_NEGATE (r); + r = real_value_negate (&r); if (REAL_VALUE_MINUS_ZERO (r)) return 0; @@ -7708,7 +7850,7 @@ /* Extract sign, exponent and mantissa. */ sign = REAL_VALUE_NEGATIVE (r) ? 1 : 0; - r = REAL_VALUE_ABS (r); + r = real_value_abs (&r); exponent = REAL_EXP (&r); /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the highest (sign) bit, with a fixed binary point at bit point_pos. @@ -8828,28 +8970,21 @@ } } -/* Must not copy a SET whose source operand is PC-relative. */ +/* Must not copy any rtx that uses a pc-relative address. */ + +static int +arm_note_pic_base (rtx *x, void *date ATTRIBUTE_UNUSED) +{ + if (GET_CODE (*x) == UNSPEC + && XINT (*x, 1) == UNSPEC_PIC_BASE) + return 1; + return 0; +} static bool arm_cannot_copy_insn_p (rtx insn) { - rtx pat = PATTERN (insn); - - if (GET_CODE (pat) == SET) - { - rtx rhs = SET_SRC (pat); - - if (GET_CODE (rhs) == UNSPEC - && XINT (rhs, 1) == UNSPEC_PIC_BASE) - return TRUE; - - if (GET_CODE (rhs) == MEM - && GET_CODE (XEXP (rhs, 0)) == UNSPEC - && XINT (XEXP (rhs, 0), 1) == UNSPEC_PIC_BASE) - return TRUE; - } - - return FALSE; + return for_each_rtx (&PATTERN (insn), arm_note_pic_base, NULL); } enum rtx_code @@ -8937,21 +9072,105 @@ return 0; } +/* Return true iff it would be profitable to turn a sequence of NOPS loads + or stores (depending on IS_STORE) into a load-multiple or store-multiple + instruction. ADD_OFFSET is nonzero if the base address register needs + to be modified with an add instruction before we can use it. */ + +static bool +multiple_operation_profitable_p (bool is_store ATTRIBUTE_UNUSED, + int nops, HOST_WIDE_INT add_offset) + { + /* For ARM8,9 & StrongARM, 2 ldr instructions are faster than an ldm + if the offset isn't small enough. The reason 2 ldrs are faster + is because these ARMs are able to do more than one cache access + in a single cycle. The ARM9 and StrongARM have Harvard caches, + whilst the ARM8 has a double bandwidth cache. This means that + these cores can do both an instruction fetch and a data fetch in + a single cycle, so the trick of calculating the address into a + scratch register (one of the result regs) and then doing a load + multiple actually becomes slower (and no smaller in code size). + That is the transformation + + ldr rd1, [rbase + offset] + ldr rd2, [rbase + offset + 4] + + to + + add rd1, rbase, offset + ldmia rd1, {rd1, rd2} + + produces worse code -- '3 cycles + any stalls on rd2' instead of + '2 cycles + any stalls on rd2'. On ARMs with only one cache + access per cycle, the first sequence could never complete in less + than 6 cycles, whereas the ldm sequence would only take 5 and + would make better use of sequential accesses if not hitting the + cache. + + We cheat here and test 'arm_ld_sched' which we currently know to + only be true for the ARM8, ARM9 and StrongARM. If this ever + changes, then the test below needs to be reworked. */ + if (nops == 2 && arm_ld_sched && add_offset != 0) + return false; + + return true; +} + +/* Subroutine of load_multiple_sequence and store_multiple_sequence. + Given an array of UNSORTED_OFFSETS, of which there are NOPS, compute + an array ORDER which describes the sequence to use when accessing the + offsets that produces an ascending order. In this sequence, each + offset must be larger by exactly 4 than the previous one. ORDER[0] + must have been filled in with the lowest offset by the caller. + If UNSORTED_REGS is nonnull, it is an array of register numbers that + we use to verify that ORDER produces an ascending order of registers. + Return true if it was possible to construct such an order, false if + not. */ + +static bool +compute_offset_order (int nops, HOST_WIDE_INT *unsorted_offsets, int *order, + int *unsorted_regs) +{ + int i; + for (i = 1; i < nops; i++) + { + int j; + + order[i] = order[i - 1]; + for (j = 0; j < nops; j++) + if (unsorted_offsets[j] == unsorted_offsets[order[i - 1]] + 4) + { + /* We must find exactly one offset that is higher than the + previous one by 4. */ + if (order[i] != order[i - 1]) + return false; + order[i] = j; + } + if (order[i] == order[i - 1]) + return false; + /* The register numbers must be ascending. */ + if (unsorted_regs != NULL + && unsorted_regs[order[i]] <= unsorted_regs[order[i - 1]]) + return false; + } + return true; +} + int load_multiple_sequence (rtx *operands, int nops, int *regs, int *base, HOST_WIDE_INT *load_offset) { - int unsorted_regs[4]; - HOST_WIDE_INT unsorted_offsets[4]; - int order[4]; + int unsorted_regs[MAX_LDM_STM_OPS]; + HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS]; + int order[MAX_LDM_STM_OPS]; int base_reg = -1; - int i; - - /* Can only handle 2, 3, or 4 insns at present, - though could be easily extended if required. */ - gcc_assert (nops >= 2 && nops <= 4); - - memset (order, 0, 4 * sizeof (int)); + int i, ldm_case; + + /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be + easily extended if required. */ + gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS); + + memset (order, 0, MAX_LDM_STM_OPS * sizeof (int)); /* Loop over the operands and check that the memory references are suitable (i.e. immediate offsets from the same base register). At @@ -8987,25 +9206,16 @@ == CONST_INT))) { if (i == 0) - { - base_reg = REGNO (reg); - unsorted_regs[0] = (GET_CODE (operands[i]) == REG - ? REGNO (operands[i]) - : REGNO (SUBREG_REG (operands[i]))); - order[0] = 0; - } + base_reg = REGNO (reg); else { if (base_reg != (int) REGNO (reg)) /* Not addressed from the same base register. */ return 0; - - unsorted_regs[i] = (GET_CODE (operands[i]) == REG - ? REGNO (operands[i]) - : REGNO (SUBREG_REG (operands[i]))); - if (unsorted_regs[i] < unsorted_regs[order[0]]) - order[0] = i; - } + } + unsorted_regs[i] = (GET_CODE (operands[i]) == REG + ? REGNO (operands[i]) + : REGNO (SUBREG_REG (operands[i]))); /* If it isn't an integer register, or if it overwrites the base register but isn't the last insn in the list, then @@ -9015,6 +9225,8 @@ return 0; unsorted_offsets[i] = INTVAL (offset); + if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]]) + order[0] = i; } else /* Not a suitable memory address. */ @@ -9023,30 +9235,11 @@ /* All the useful information has now been extracted from the operands into unsorted_regs and unsorted_offsets; additionally, - order[0] has been set to the lowest numbered register in the - list. Sort the registers into order, and check that the memory - offsets are ascending and adjacent. */ - - for (i = 1; i < nops; i++) - { - int j; - - order[i] = order[i - 1]; - for (j = 0; j < nops; j++) - if (unsorted_regs[j] > unsorted_regs[order[i - 1]] - && (order[i] == order[i - 1] - || unsorted_regs[j] < unsorted_regs[order[i]])) - order[i] = j; - - /* Have we found a suitable register? if not, one must be used more - than once. */ - if (order[i] == order[i - 1]) - return 0; - - /* Is the memory address adjacent and ascending? */ - if (unsorted_offsets[order[i]] != unsorted_offsets[order[i - 1]] + 4) - return 0; - } + order[0] has been set to the lowest offset in the list. Sort + the offsets into order, verifying that they are adjacent, and + check that the register numbers are ascending. */ + if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs)) + return 0; if (base) { @@ -9059,59 +9252,31 @@ } if (unsorted_offsets[order[0]] == 0) - return 1; /* ldmia */ - - if (TARGET_ARM && unsorted_offsets[order[0]] == 4) - return 2; /* ldmib */ - - if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0) - return 3; /* ldmda */ - - if (unsorted_offsets[order[nops - 1]] == -4) - return 4; /* ldmdb */ - - /* For ARM8,9 & StrongARM, 2 ldr instructions are faster than an ldm - if the offset isn't small enough. The reason 2 ldrs are faster - is because these ARMs are able to do more than one cache access - in a single cycle. The ARM9 and StrongARM have Harvard caches, - whilst the ARM8 has a double bandwidth cache. This means that - these cores can do both an instruction fetch and a data fetch in - a single cycle, so the trick of calculating the address into a - scratch register (one of the result regs) and then doing a load - multiple actually becomes slower (and no smaller in code size). - That is the transformation - - ldr rd1, [rbase + offset] - ldr rd2, [rbase + offset + 4] - - to - - add rd1, rbase, offset - ldmia rd1, {rd1, rd2} - - produces worse code -- '3 cycles + any stalls on rd2' instead of - '2 cycles + any stalls on rd2'. On ARMs with only one cache - access per cycle, the first sequence could never complete in less - than 6 cycles, whereas the ldm sequence would only take 5 and - would make better use of sequential accesses if not hitting the - cache. - - We cheat here and test 'arm_ld_sched' which we currently know to - only be true for the ARM8, ARM9 and StrongARM. If this ever - changes, then the test below needs to be reworked. */ - if (nops == 2 && arm_ld_sched) - return 0; - - /* Can't do it without setting up the offset, only do this if it takes - no more than one insn. */ - return (const_ok_for_arm (unsorted_offsets[order[0]]) - || const_ok_for_arm (-unsorted_offsets[order[0]])) ? 5 : 0; + ldm_case = 1; /* ldmia */ + else if (TARGET_ARM && unsorted_offsets[order[0]] == 4) + ldm_case = 2; /* ldmib */ + else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0) + ldm_case = 3; /* ldmda */ + else if (unsorted_offsets[order[nops - 1]] == -4) + ldm_case = 4; /* ldmdb */ + else if (const_ok_for_arm (unsorted_offsets[order[0]]) + || const_ok_for_arm (-unsorted_offsets[order[0]])) + ldm_case = 5; + else + return 0; + + if (!multiple_operation_profitable_p (false, nops, + ldm_case == 5 + ? unsorted_offsets[order[0]] : 0)) + return 0; + + return ldm_case; } const char * emit_ldm_seq (rtx *operands, int nops) { - int regs[4]; + int regs[MAX_LDM_STM_OPS]; int base_reg; HOST_WIDE_INT offset; char buf[100]; @@ -9170,17 +9335,17 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base, HOST_WIDE_INT * load_offset) { - int unsorted_regs[4]; - HOST_WIDE_INT unsorted_offsets[4]; - int order[4]; + int unsorted_regs[MAX_LDM_STM_OPS]; + HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS]; + int order[MAX_LDM_STM_OPS]; int base_reg = -1; - int i; - - /* Can only handle 2, 3, or 4 insns at present, though could be easily - extended if required. */ - gcc_assert (nops >= 2 && nops <= 4); - - memset (order, 0, 4 * sizeof (int)); + int i, stm_case; + + /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be + easily extended if required. */ + gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS); + + memset (order, 0, MAX_LDM_STM_OPS * sizeof (int)); /* Loop over the operands and check that the memory references are suitable (i.e. immediate offsets from the same base register). At @@ -9215,32 +9380,22 @@ && (GET_CODE (offset = XEXP (XEXP (operands[nops + i], 0), 1)) == CONST_INT))) { + unsorted_regs[i] = (GET_CODE (operands[i]) == REG + ? REGNO (operands[i]) + : REGNO (SUBREG_REG (operands[i]))); if (i == 0) - { - base_reg = REGNO (reg); - unsorted_regs[0] = (GET_CODE (operands[i]) == REG - ? REGNO (operands[i]) - : REGNO (SUBREG_REG (operands[i]))); - order[0] = 0; - } - else - { - if (base_reg != (int) REGNO (reg)) - /* Not addressed from the same base register. */ - return 0; - - unsorted_regs[i] = (GET_CODE (operands[i]) == REG - ? REGNO (operands[i]) - : REGNO (SUBREG_REG (operands[i]))); - if (unsorted_regs[i] < unsorted_regs[order[0]]) - order[0] = i; - } + base_reg = REGNO (reg); + else if (base_reg != (int) REGNO (reg)) + /* Not addressed from the same base register. */ + return 0; /* If it isn't an integer register, then we can't do this. */ if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14) return 0; unsorted_offsets[i] = INTVAL (offset); + if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]]) + order[0] = i; } else /* Not a suitable memory address. */ @@ -9249,30 +9404,11 @@ /* All the useful information has now been extracted from the operands into unsorted_regs and unsorted_offsets; additionally, - order[0] has been set to the lowest numbered register in the - list. Sort the registers into order, and check that the memory - offsets are ascending and adjacent. */ - - for (i = 1; i < nops; i++) - { - int j; - - order[i] = order[i - 1]; - for (j = 0; j < nops; j++) - if (unsorted_regs[j] > unsorted_regs[order[i - 1]] - && (order[i] == order[i - 1] - || unsorted_regs[j] < unsorted_regs[order[i]])) - order[i] = j; - - /* Have we found a suitable register? if not, one must be used more - than once. */ - if (order[i] == order[i - 1]) - return 0; - - /* Is the memory address adjacent and ascending? */ - if (unsorted_offsets[order[i]] != unsorted_offsets[order[i - 1]] + 4) - return 0; - } + order[0] has been set to the lowest offset in the list. Sort + the offsets into order, verifying that they are adjacent, and + check that the register numbers are ascending. */ + if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs)) + return 0; if (base) { @@ -9285,24 +9421,26 @@ } if (unsorted_offsets[order[0]] == 0) - return 1; /* stmia */ - - if (unsorted_offsets[order[0]] == 4) - return 2; /* stmib */ - - if (unsorted_offsets[order[nops - 1]] == 0) - return 3; /* stmda */ - - if (unsorted_offsets[order[nops - 1]] == -4) - return 4; /* stmdb */ - - return 0; + stm_case = 1; /* stmia */ + else if (TARGET_ARM && unsorted_offsets[order[0]] == 4) + stm_case = 2; /* stmib */ + else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0) + stm_case = 3; /* stmda */ + else if (unsorted_offsets[order[nops - 1]] == -4) + stm_case = 4; /* stmdb */ + else + return 0; + + if (!multiple_operation_profitable_p (false, nops, 0)) + return 0; + + return stm_case; } const char * emit_stm_seq (rtx *operands, int nops) { - int regs[4]; + int regs[MAX_LDM_STM_OPS]; int base_reg; HOST_WIDE_INT offset; char buf[100]; @@ -11684,9 +11822,14 @@ XVECEXP (par, 0, 0) = gen_rtx_SET (VOIDmode, - gen_frame_mem (BLKmode, - gen_rtx_PRE_DEC (BLKmode, - stack_pointer_rtx)), + gen_frame_mem + (BLKmode, + gen_rtx_PRE_MODIFY (Pmode, + stack_pointer_rtx, + plus_constant + (stack_pointer_rtx, + - (count * 8))) + ), gen_rtx_UNSPEC (BLKmode, gen_rtvec (1, reg), UNSPEC_PUSH_MULT)); @@ -13753,24 +13896,29 @@ if (TARGET_HARD_FLOAT && TARGET_VFP) { - start_reg = FIRST_VFP_REGNUM; - for (reg = FIRST_VFP_REGNUM; reg < LAST_VFP_REGNUM; reg += 2) + int end_reg = LAST_VFP_REGNUM + 1; + + /* Scan the registers in reverse order. We need to match + any groupings made in the prologue and generate matching + pop operations. */ + for (reg = LAST_VFP_REGNUM - 1; reg >= FIRST_VFP_REGNUM; reg -= 2) { if ((!df_regs_ever_live_p (reg) || call_used_regs[reg]) - && (!df_regs_ever_live_p (reg + 1) || call_used_regs[reg + 1])) - { - if (start_reg != reg) + && (!df_regs_ever_live_p (reg + 1) + || call_used_regs[reg + 1])) + { + if (end_reg > reg + 2) vfp_output_fldmd (f, SP_REGNUM, - (start_reg - FIRST_VFP_REGNUM) / 2, - (reg - start_reg) / 2); - start_reg = reg + 2; - } - } - if (start_reg != reg) - vfp_output_fldmd (f, SP_REGNUM, - (start_reg - FIRST_VFP_REGNUM) / 2, - (reg - start_reg) / 2); - } + (reg + 2 - FIRST_VFP_REGNUM) / 2, + (end_reg - (reg + 2)) / 2); + end_reg = reg; + } + } + if (end_reg > reg + 2) + vfp_output_fldmd (f, SP_REGNUM, 0, + (end_reg - (reg + 2)) / 2); + } + if (TARGET_IWMMXT) for (reg = FIRST_IWMMXT_REGNUM; reg <= LAST_IWMMXT_REGNUM; reg++) if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) @@ -13939,16 +14087,17 @@ /* For the body of the insn we are going to generate an UNSPEC in parallel with several USEs. This allows the insn to be recognized - by the push_multi pattern in the arm.md file. The insn looks - something like this: + by the push_multi pattern in the arm.md file. + + The body of the insn looks something like this: (parallel [ - (set (mem:BLK (pre_dec:BLK (reg:SI sp))) + (set (mem:BLK (pre_modify:SI (reg:SI sp) + (const_int:SI <num>))) (unspec:BLK [(reg:SI r4)] UNSPEC_PUSH_MULT)) - (use (reg:SI 11 fp)) - (use (reg:SI 12 ip)) - (use (reg:SI 14 lr)) - (use (reg:SI 15 pc)) + (use (reg:SI XX)) + (use (reg:SI YY)) + ... ]) For the frame note however, we try to be more explicit and actually @@ -13961,13 +14110,20 @@ (sequence [ (set (reg:SI sp) (plus:SI (reg:SI sp) (const_int -20))) (set (mem:SI (reg:SI sp)) (reg:SI r4)) - (set (mem:SI (plus:SI (reg:SI sp) (const_int 4))) (reg:SI fp)) - (set (mem:SI (plus:SI (reg:SI sp) (const_int 8))) (reg:SI ip)) - (set (mem:SI (plus:SI (reg:SI sp) (const_int 12))) (reg:SI lr)) + (set (mem:SI (plus:SI (reg:SI sp) (const_int 4))) (reg:SI XX)) + (set (mem:SI (plus:SI (reg:SI sp) (const_int 8))) (reg:SI YY)) + ... ]) - This sequence is used both by the code to support stack unwinding for - exceptions handlers and the code to generate dwarf2 frame debugging. */ + FIXME:: In an ideal world the PRE_MODIFY would not exist and + instead we'd have a parallel expression detailing all + the stores to the various memory addresses so that debug + information is more up-to-date. Remember however while writing + this to take care of the constraints with the push instruction. + + Note also that this has to be taken care of for the VFP registers. + + For more see PR43399. */ par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (num_regs)); dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (num_dwarf_regs + 1)); @@ -13981,9 +14137,14 @@ XVECEXP (par, 0, 0) = gen_rtx_SET (VOIDmode, - gen_frame_mem (BLKmode, - gen_rtx_PRE_DEC (BLKmode, - stack_pointer_rtx)), + gen_frame_mem + (BLKmode, + gen_rtx_PRE_MODIFY (Pmode, + stack_pointer_rtx, + plus_constant + (stack_pointer_rtx, + -4 * num_regs)) + ), gen_rtx_UNSPEC (BLKmode, gen_rtvec (1, reg), UNSPEC_PUSH_MULT)); @@ -14014,9 +14175,10 @@ { tmp = gen_rtx_SET (VOIDmode, - gen_frame_mem (SImode, - plus_constant (stack_pointer_rtx, - 4 * j)), + gen_frame_mem + (SImode, + plus_constant (stack_pointer_rtx, + 4 * j)), reg); RTX_FRAME_RELATED_P (tmp) = 1; XVECEXP (dwarf, 0, dwarf_par_index++) = tmp; @@ -14068,9 +14230,14 @@ XVECEXP (par, 0, 0) = gen_rtx_SET (VOIDmode, - gen_frame_mem (BLKmode, - gen_rtx_PRE_DEC (BLKmode, - stack_pointer_rtx)), + gen_frame_mem + (BLKmode, + gen_rtx_PRE_MODIFY (Pmode, + stack_pointer_rtx, + plus_constant + (stack_pointer_rtx, + -12 * count)) + ), gen_rtx_UNSPEC (BLKmode, gen_rtvec (1, reg), UNSPEC_PUSH_MULT)); @@ -14429,7 +14596,7 @@ for (reg = LAST_IWMMXT_REGNUM; reg >= FIRST_IWMMXT_REGNUM; reg--) if (df_regs_ever_live_p (reg) && ! call_used_regs[reg]) { - insn = gen_rtx_PRE_DEC (V2SImode, stack_pointer_rtx); + insn = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx); insn = gen_rtx_MEM (V2SImode, insn); insn = emit_set_insn (insn, gen_rtx_REG (V2SImode, reg)); RTX_FRAME_RELATED_P (insn) = 1; @@ -14443,7 +14610,7 @@ for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--) if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) { - insn = gen_rtx_PRE_DEC (XFmode, stack_pointer_rtx); + insn = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx); insn = gen_rtx_MEM (XFmode, insn); insn = emit_set_insn (insn, gen_rtx_REG (XFmode, reg)); RTX_FRAME_RELATED_P (insn) = 1; @@ -14966,7 +15133,7 @@ { REAL_VALUE_TYPE r; REAL_VALUE_FROM_CONST_DOUBLE (r, x); - r = REAL_VALUE_NEGATE (r); + r = real_value_negate (&r); fprintf (stream, "%s", fp_const_from_val (&r)); } return; @@ -19236,6 +19403,51 @@ } } +/* Given the stack offsets and register mask in OFFSETS, decide + how many additional registers to push instead of subtracting + a constant from SP. */ +static int +thumb1_extra_regs_pushed (arm_stack_offsets *offsets) +{ + HOST_WIDE_INT amount = offsets->outgoing_args - offsets->saved_regs; + unsigned long live_regs_mask = offsets->saved_regs_mask; + /* Extract a mask of the ones we can give to the Thumb's push instruction. */ + unsigned long l_mask = live_regs_mask & 0x40ff; + /* Then count how many other high registers will need to be pushed. */ + unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00); + int n_free; + + /* If the stack frame size is 512 exactly, we can save one load + instruction, which should make this a win even when optimizing + for speed. */ + if (!optimize_size && amount != 512) + return 0; + + /* Can't do this if there are high registers to push, or if we + are not going to do a push at all. */ + if (high_regs_pushed != 0 || l_mask == 0) + return 0; + + /* Don't do this if thumb1_expand_prologue wants to emit instructions + between the push and the stack frame allocation. */ + if ((flag_pic && arm_pic_register != INVALID_REGNUM) + || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0)) + return 0; + + for (n_free = 0; n_free < 8 && !(live_regs_mask & 1); live_regs_mask >>= 1) + n_free++; + + if (n_free == 0) + return 0; + gcc_assert (amount / 4 * 4 == amount); + + if (amount >= 512 && (amount - n_free * 4) < 512) + return (amount - 508) / 4; + if (amount <= n_free * 4) + return amount / 4; + return 0; +} + /* Generate the rest of a function's prologue. */ void thumb1_expand_prologue (void) @@ -19272,6 +19484,7 @@ stack_pointer_rtx); amount = offsets->outgoing_args - offsets->saved_regs; + amount -= 4 * thumb1_extra_regs_pushed (offsets); if (amount) { if (amount < 512) @@ -19576,7 +19789,11 @@ register. */ else if ((l_mask & 0xff) != 0 || (high_regs_pushed == 0 && l_mask)) - thumb_pushpop (f, l_mask, 1, &cfa_offset, l_mask); + { + unsigned long mask = l_mask; + mask |= (1 << thumb1_extra_regs_pushed (offsets)) - 1; + thumb_pushpop (f, mask, 1, &cfa_offset, mask); + } if (high_regs_pushed) { @@ -20729,6 +20946,13 @@ return false; } +/* Implements target hook small_register_classes_for_mode_p. */ +bool +arm_small_register_classes_for_mode_p (enum machine_mode mode ATTRIBUTE_UNUSED) +{ + return TARGET_THUMB1; +} + /* Implement TARGET_SHIFT_TRUNCATION_MASK. SImode shifts use normal ARM insns and therefore guarantee that the shift count is modulo 256. DImode shifts (those implemented by lib1funcs.asm or by optabs.c) @@ -20971,7 +21195,7 @@ offset = INTVAL (XEXP (e1, 1)); asm_fprintf (asm_out_file, "\t.setfp %r, %r, #%wd\n", HARD_FRAME_POINTER_REGNUM, reg, - INTVAL (XEXP (e1, 1))); + offset); } else if (GET_CODE (e1) == REG) { @@ -21207,6 +21431,16 @@ fputc (')', fp); return TRUE; } + else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SYMBOL_OFFSET) + { + output_addr_const (fp, XVECEXP (x, 0, 0)); + if (GOT_PCREL) + fputs ("+.", fp); + fputs ("-(", fp); + output_addr_const (fp, XVECEXP (x, 0, 1)); + fputc (')', fp); + return TRUE; + } else if (GET_CODE (x) == CONST_VECTOR) return arm_emit_vector_const (fp, x); @@ -21251,12 +21485,9 @@ thumb1_output_casesi (rtx *operands) { rtx diff_vec = PATTERN (next_real_insn (operands[0])); - addr_diff_vec_flags flags; gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC); - flags = ADDR_DIFF_VEC_FLAGS (diff_vec); - switch (GET_MODE(diff_vec)) { case QImode: @@ -21372,7 +21603,7 @@ && lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type)) { static bool warned; - if (!warned && warn_psabi) + if (!warned && warn_psabi && !in_system_header) { warned = true; inform (input_location,