Mercurial > hg > CbC > CbC_gcc
diff gcc/config/i386/i386.c @ 63:b7f97abdc517 gcc-4.6-20100522
update gcc from gcc-4.5.0 to gcc-4.6
author | ryoma <e075725@ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 24 May 2010 12:47:05 +0900 |
parents | 77e2b8dfacca |
children | f6334be47118 |
line wrap: on
line diff
--- a/gcc/config/i386/i386.c Fri Feb 12 23:41:23 2010 +0900 +++ b/gcc/config/i386/i386.c Mon May 24 12:47:05 2010 +0900 @@ -1,6 +1,6 @@ /* Subroutines used for code generation on IA-32. - Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, - 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, + 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. This file is part of GCC. @@ -28,7 +28,6 @@ #include "tm_p.h" #include "regs.h" #include "hard-reg-set.h" -#include "real.h" #include "insn-config.h" #include "conditions.h" #include "output.h" @@ -53,6 +52,8 @@ #include "tm-constrs.h" #include "params.h" #include "cselib.h" +#include "debug.h" +#include "dwarf2out.h" static rtx legitimize_dllimport_symbol (rtx, bool); @@ -817,6 +818,93 @@ 1, /* cond_not_taken_branch_cost. */ }; +struct processor_costs bdver1_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8 + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10 + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of l1 cache. */ + 1024, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + + /* BDVER1 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 2, /* vec_store_cost. */ + 2, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + static const struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -1274,7 +1362,8 @@ #define m_ATHLON (1<<PROCESSOR_ATHLON) #define m_ATHLON_K8 (m_K8 | m_ATHLON) #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10) -#define m_AMD_MULTIPLE (m_K8 | m_ATHLON | m_AMDFAM10) +#define m_BDVER1 (1<<PROCESSOR_BDVER1) +#define m_AMD_MULTIPLE (m_K8 | m_ATHLON | m_AMDFAM10 | m_BDVER1) #define m_GENERIC32 (1<<PROCESSOR_GENERIC32) #define m_GENERIC64 (1<<PROCESSOR_GENERIC64) @@ -1319,7 +1408,7 @@ ~m_386, /* X86_TUNE_USE_SAHF */ - m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4 + m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER1 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC, /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid @@ -1423,10 +1512,16 @@ while enabling it on K8 brings roughly 2.4% regression that can be partly masked by careful scheduling of moves. */ m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC - | m_AMDFAM10, - - /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */ - m_AMDFAM10, + | m_AMDFAM10 | m_BDVER1, + + /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */ + m_AMDFAM10 | m_BDVER1, + + /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */ + m_BDVER1, + + /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */ + m_BDVER1, /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies are resolved on SSE register parts instead of whole registers, so we may @@ -1456,10 +1551,10 @@ m_AMD_MULTIPLE, /* X86_TUNE_INTER_UNIT_MOVES */ - ~(m_AMD_MULTIPLE | m_ATOM | m_GENERIC), + ~(m_AMD_MULTIPLE | m_GENERIC), /* X86_TUNE_INTER_UNIT_CONVERSIONS */ - ~(m_AMDFAM10), + ~(m_AMDFAM10 | m_BDVER1), /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more than 4 branch instructions in the 16 byte window. */ @@ -1495,11 +1590,11 @@ /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is vector path on AMD machines. */ - m_K8 | m_GENERIC64 | m_AMDFAM10, + m_K8 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1, /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD machines. */ - m_K8 | m_GENERIC64 | m_AMDFAM10, + m_K8 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1, /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR than a MOV. */ @@ -1525,7 +1620,7 @@ /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction with a subsequent conditional jump instruction into a single compare-and-branch uop. */ - m_CORE2, + m_CORE2 | m_BDVER1, /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag will impact LEA instruction selection. */ @@ -1878,6 +1973,7 @@ static struct machine_function * ix86_init_machine_status (void); static rtx ix86_function_value (const_tree, const_tree, bool); +static bool ix86_function_value_regno_p (const unsigned int); static rtx ix86_static_chain (const_tree, bool); static int ix86_function_regparm (const_tree, const_tree); static void ix86_compute_frame_layout (struct ix86_frame *); @@ -1910,6 +2006,10 @@ static enum calling_abi ix86_function_abi (const_tree); +#ifndef SUBTARGET32_DEFAULT_CPU +#define SUBTARGET32_DEFAULT_CPU "i386" +#endif + /* The svr4 ABI for the i386 says that records and unions are returned in memory. */ #ifndef DEFAULT_PCC_STRUCT_RETURN @@ -2060,6 +2160,7 @@ {&generic32_cost, 16, 7, 16, 7, 16}, {&generic64_cost, 16, 10, 16, 10, 16}, {&amdfam10_cost, 32, 24, 32, 7, 32}, + {&bdver1_cost, 32, 24, 32, 7, 32}, {&atom_cost, 16, 7, 16, 7, 16} }; @@ -2086,7 +2187,8 @@ "athlon", "athlon-4", "k8", - "amdfam10" + "amdfam10", + "bdver1" }; /* Implement TARGET_HANDLE_OPTION. */ @@ -2400,7 +2502,7 @@ } } -/* Return a string the documents the current -m options. The caller is +/* Return a string that documents the current -m options. The caller is responsible for freeing the string. */ static char * @@ -2419,6 +2521,7 @@ { { "-m64", OPTION_MASK_ISA_64BIT }, { "-mfma4", OPTION_MASK_ISA_FMA4 }, + { "-mfma", OPTION_MASK_ISA_FMA }, { "-mxop", OPTION_MASK_ISA_XOP }, { "-mlwp", OPTION_MASK_ISA_LWP }, { "-msse4a", OPTION_MASK_ISA_SSE4A }, @@ -2505,7 +2608,7 @@ if (isa && add_nl_p) { opts[num++][0] = isa_other; - sprintf (isa_other, "(other isa: 0x%x)", isa); + sprintf (isa_other, "(other isa: %#x)", isa); } /* Add flag options. */ @@ -2521,7 +2624,7 @@ if (flags && add_nl_p) { opts[num++][0] = target_other; - sprintf (target_other, "(other flags: 0x%x)", isa); + sprintf (target_other, "(other flags: %#x)", flags); } /* Add -fpmath= option. */ @@ -2621,6 +2724,7 @@ { int i; unsigned int ix86_arch_mask, ix86_tune_mask; + const bool ix86_tune_specified = (ix86_tune_string != NULL); const char *prefix; const char *suffix; const char *sw; @@ -2742,6 +2846,11 @@ {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10, PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM}, + {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1, + PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE + | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM + | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES + | PTA_PCLMUL | PTA_AVX | PTA_FMA4 | PTA_XOP | PTA_LWP}, {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO, 0 /* flags are only used for -march switch. */ }, {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64, @@ -2821,8 +2930,12 @@ || !strcmp (ix86_tune_string, "generic64"))) ; else if (!strncmp (ix86_tune_string, "generic", 7)) - error ("bad value (%s) for %stune=%s %s", + error ("bad value (%s) for %stune=%s %s", ix86_tune_string, prefix, suffix, sw); + else if (!strcmp (ix86_tune_string, "x86-64")) + warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated. Use " + "%stune=k8%s or %stune=generic%s instead as appropriate.", + prefix, suffix, prefix, suffix, prefix, suffix); } else { @@ -2846,6 +2959,7 @@ ix86_tune_string = "generic32"; } } + if (ix86_stringop_string) { if (!strcmp (ix86_stringop_string, "rep_byte")) @@ -2868,23 +2982,12 @@ error ("bad value (%s) for %sstringop-strategy=%s %s", ix86_stringop_string, prefix, suffix, sw); } - if (!strcmp (ix86_tune_string, "x86-64")) - warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated. Use " - "%stune=k8%s or %stune=generic%s instead as appropriate.", - prefix, suffix, prefix, suffix, prefix, suffix); if (!ix86_arch_string) - ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386"; + ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU; else ix86_arch_specified = 1; - if (!strcmp (ix86_arch_string, "generic")) - error ("generic CPU can be used only for %stune=%s %s", - prefix, suffix, sw); - if (!strncmp (ix86_arch_string, "generic", 7)) - error ("bad value (%s) for %sarch=%s %s", - ix86_arch_string, prefix, suffix, sw); - /* Validate -mabi= value. */ if (ix86_abi_string) { @@ -3032,7 +3135,10 @@ break; } - if (i == pta_size) + if (!strcmp (ix86_arch_string, "generic")) + error ("generic CPU can be used only for %stune=%s %s", + prefix, suffix, sw); + else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size) error ("bad value (%s) for %sarch=%s %s", ix86_arch_string, prefix, suffix, sw); @@ -3071,7 +3177,8 @@ x86_prefetch_sse = true; break; } - if (i == pta_size) + + if (ix86_tune_specified && i == pta_size) error ("bad value (%s) for %stune=%s %s", ix86_tune_string, prefix, suffix, sw); @@ -3191,8 +3298,6 @@ ix86_tls_dialect = TLS_DIALECT_GNU; else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0) ix86_tls_dialect = TLS_DIALECT_GNU2; - else if (strcmp (ix86_tls_dialect_string, "sun") == 0) - ix86_tls_dialect = TLS_DIALECT_SUN; else error ("bad value (%s) for %stls-dialect=%s %s", ix86_tls_dialect_string, prefix, suffix, sw); @@ -4284,6 +4389,10 @@ flag_schedule_insns = 0; #endif + /* For -O2 and beyond, turn on -fzee for x86_64 target. */ + if (level > 1 && TARGET_64BIT) + flag_zee = 1; + if (TARGET_MACHO) /* The Darwin libraries never set errno, so we might as well avoid calling them when that's the only reason we would. */ @@ -4391,8 +4500,8 @@ return true; } -/* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm" - calling convention attributes; +/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall", + and "sseregparm" calling convention attributes; arguments as in struct attribute_spec.handler. */ static tree @@ -4422,6 +4531,11 @@ error ("fastcall and regparm attributes are not compatible"); } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("regparam and thiscall attributes are not compatible"); + } + cst = TREE_VALUE (args); if (TREE_CODE (cst) != INTEGER_CST) { @@ -4443,7 +4557,8 @@ if (TARGET_64BIT) { /* Do not warn when emulating the MS ABI. */ - if (TREE_CODE (*node) != FUNCTION_TYPE + if ((TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE) || ix86_function_type_abi (*node) != MS_ABI) warning (OPT_Wattributes, "%qE attribute ignored", name); @@ -4466,6 +4581,10 @@ { error ("fastcall and regparm attributes are not compatible"); } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and thiscall attributes are not compatible"); + } } /* Can combine stdcall with fastcall (redundant), regparm and @@ -4480,6 +4599,10 @@ { error ("stdcall and fastcall attributes are not compatible"); } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and thiscall attributes are not compatible"); + } } /* Can combine cdecl with regparm and sseregparm. */ @@ -4493,6 +4616,28 @@ { error ("fastcall and cdecl attributes are not compatible"); } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("cdecl and thiscall attributes are not compatible"); + } + } + else if (is_attribute_p ("thiscall", name)) + { + if (TREE_CODE (*node) != METHOD_TYPE && pedantic) + warning (OPT_Wattributes, "%qE attribute is used for none class-method", + name); + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and thiscall attributes are not compatible"); + } + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and thiscall attributes are not compatible"); + } + if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + { + error ("cdecl and thiscall attributes are not compatible"); + } } /* Can combine sseregparm with all attributes. */ @@ -4526,6 +4671,11 @@ != !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2))) return 0; + /* Check for mismatched thiscall types. */ + if (!lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type1)) + != !lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type2))) + return 0; + /* Check for mismatched return types (cdecl vs stdcall). */ if (!lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type1)) != !lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type2))) @@ -4559,6 +4709,9 @@ if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) return 2; + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type))) + return 1; + /* Use register calling convention for local functions when possible. */ if (decl && TREE_CODE (decl) == FUNCTION_DECL @@ -4696,7 +4849,8 @@ /* Stdcall and fastcall functions will pop the stack if not variable args. */ if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (funtype)) - || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype))) + || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype)) + || lookup_attribute ("thiscall", TYPE_ATTRIBUTES (funtype))) rtd = 1; if (rtd && ! stdarg_p (funtype)) @@ -4959,7 +5113,12 @@ else look for regparm information. */ if (fntype) { - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype))) + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype))) + { + cum->nregs = 1; + cum->fastcall = 1; /* Same first register as in fastcall. */ + } + else if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype))) { cum->nregs = 2; cum->fastcall = 1; @@ -6277,8 +6436,8 @@ /* Return true if N is a possible register number of function value. */ -bool -ix86_function_value_regno_p (int regno) +static bool +ix86_function_value_regno_p (const unsigned int regno) { switch (regno) { @@ -6736,7 +6895,6 @@ { rtx save_area, mem; rtx label; - rtx label_ref; rtx tmp_reg; rtx nsse_reg; alias_set_type set; @@ -6787,35 +6945,9 @@ SSE saves. We need some preparation work to get this working. */ label = gen_label_rtx (); - label_ref = gen_rtx_LABEL_REF (Pmode, label); - - /* Compute address to jump to : - label - eax*4 + nnamed_sse_arguments*4 Or - label - eax*5 + nnamed_sse_arguments*5 for AVX. */ - tmp_reg = gen_reg_rtx (Pmode); + nsse_reg = gen_reg_rtx (Pmode); emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, AX_REG))); - emit_insn (gen_rtx_SET (VOIDmode, tmp_reg, - gen_rtx_MULT (Pmode, nsse_reg, - GEN_INT (4)))); - - /* vmovaps is one byte longer than movaps. */ - if (TARGET_AVX) - emit_insn (gen_rtx_SET (VOIDmode, tmp_reg, - gen_rtx_PLUS (Pmode, tmp_reg, - nsse_reg))); - - if (cum->sse_regno) - emit_move_insn - (nsse_reg, - gen_rtx_CONST (DImode, - gen_rtx_PLUS (DImode, - label_ref, - GEN_INT (cum->sse_regno - * (TARGET_AVX ? 5 : 4))))); - else - emit_move_insn (nsse_reg, label_ref); - emit_insn (gen_subdi3 (nsse_reg, nsse_reg, tmp_reg)); /* Compute address of memory block we save into. We always use pointer pointing 127 bytes after first byte to store - this is needed to keep @@ -6828,11 +6960,12 @@ mem = gen_rtx_MEM (BLKmode, plus_constant (tmp_reg, -127)); MEM_NOTRAP_P (mem) = 1; set_mem_alias_set (mem, set); - set_mem_align (mem, BITS_PER_WORD); + set_mem_align (mem, 64); /* And finally do the dirty job! */ emit_insn (gen_sse_prologue_save (mem, nsse_reg, - GEN_INT (cum->sse_regno), label)); + GEN_INT (cum->sse_regno), label, + gen_reg_rtx (Pmode))); } } @@ -6993,7 +7126,7 @@ int indirect_p = 0; tree ptrtype; enum machine_mode nat_mode; - int arg_boundary; + unsigned int arg_boundary; /* Only 64bit target needs something special. */ if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist))) @@ -7225,6 +7358,8 @@ t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, size_int (-align)); t = fold_convert (TREE_TYPE (ovf), t); + if (crtl->stack_alignment_needed < arg_boundary) + crtl->stack_alignment_needed = arg_boundary; } gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); gimplify_assign (addr, t, pre_p); @@ -7434,15 +7569,27 @@ case MODE_V4SF: return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0"; case MODE_V2DF: - return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0"; + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0"; + else + return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0"; case MODE_TI: - return TARGET_AVX ? "vpxor\t%0, %0, %0" : "pxor\t%0, %0"; + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0"; + else + return TARGET_AVX ? "vpxor\t%0, %0, %0" : "pxor\t%0, %0"; case MODE_V8SF: return "vxorps\t%x0, %x0, %x0"; case MODE_V4DF: - return "vxorpd\t%x0, %x0, %x0"; + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vxorps\t%x0, %x0, %x0"; + else + return "vxorpd\t%x0, %x0, %x0"; case MODE_OI: - return "vpxor\t%x0, %x0, %x0"; + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vxorps\t%x0, %x0, %x0"; + else + return "vpxor\t%x0, %x0, %x0"; default: break; } @@ -7576,8 +7723,8 @@ /* This function generates code for -fpic that loads %ebx with the return address of the caller and then returns. */ -void -ix86_file_end (void) +static void +ix86_code_end (void) { rtx xops[2]; int regno; @@ -7585,12 +7732,21 @@ for (regno = 0; regno < 8; ++regno) { char name[32]; + tree decl; if (! ((pic_labels_used >> regno) & 1)) continue; get_pc_thunk_name (name, regno); + decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, + get_identifier (name), + build_function_type (void_type_node, void_list_node)); + DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, + NULL_TREE, void_type_node); + TREE_PUBLIC (decl) = 1; + TREE_STATIC (decl) = 1; + #if TARGET_MACHO if (TARGET_MACHO) { @@ -7601,18 +7757,12 @@ assemble_name (asm_out_file, name); fputs ("\n", asm_out_file); ASM_OUTPUT_LABEL (asm_out_file, name); + DECL_WEAK (decl) = 1; } else #endif if (USE_HIDDEN_LINKONCE) { - tree decl; - - decl = build_decl (BUILTINS_LOCATION, - FUNCTION_DECL, get_identifier (name), - error_mark_node); - TREE_PUBLIC (decl) = 1; - TREE_STATIC (decl) = 1; DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl); (*targetm.asm_out.unique_section) (decl, 0); @@ -7630,14 +7780,23 @@ ASM_OUTPUT_LABEL (asm_out_file, name); } + DECL_INITIAL (decl) = make_node (BLOCK); + current_function_decl = decl; + init_function_start (decl); + first_function_block_is_cold = false; + /* Make sure unwind info is emitted for the thunk if needed. */ + final_start_function (emit_barrier (), asm_out_file, 1); + xops[0] = gen_rtx_REG (Pmode, regno); xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops); output_asm_insn ("ret", xops); - } - - if (NEED_INDICATE_EXEC_STACK) - file_end_indicate_exec_stack (); + final_end_function (); + init_insn_lengths (); + free_after_compilation (cfun); + set_cfun (NULL); + current_function_decl = NULL; + } } /* Emit code for the SET_GOT patterns. */ @@ -7674,7 +7833,24 @@ if (!flag_pic) output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops); else - output_asm_insn ("call\t%a2", xops); + { + output_asm_insn ("call\t%a2", xops); +#ifdef DWARF2_UNWIND_INFO + /* The call to next label acts as a push. */ + if (dwarf2out_do_frame ()) + { + rtx insn; + start_sequence (); + insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + gen_rtx_PLUS (Pmode, + stack_pointer_rtx, + GEN_INT (-4)))); + RTX_FRAME_RELATED_P (insn) = 1; + dwarf2out_frame_debug (insn, true); + end_sequence (); + } +#endif + } #if TARGET_MACHO /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This @@ -7687,7 +7863,27 @@ CODE_LABEL_NUMBER (XEXP (xops[2], 0))); if (flag_pic) - output_asm_insn ("pop%z0\t%0", xops); + { + output_asm_insn ("pop%z0\t%0", xops); +#ifdef DWARF2_UNWIND_INFO + /* The pop is a pop and clobbers dest, but doesn't restore it + for unwind info purposes. */ + if (dwarf2out_do_frame ()) + { + rtx insn; + start_sequence (); + insn = emit_insn (gen_rtx_SET (VOIDmode, dest, const0_rtx)); + dwarf2out_frame_debug (insn, true); + insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + gen_rtx_PLUS (Pmode, + stack_pointer_rtx, + GEN_INT (4)))); + RTX_FRAME_RELATED_P (insn) = 1; + dwarf2out_frame_debug (insn, true); + end_sequence (); + } +#endif + } } else { @@ -7695,6 +7891,18 @@ get_pc_thunk_name (name, REGNO (dest)); pic_labels_used |= 1 << REGNO (dest); +#ifdef DWARF2_UNWIND_INFO + /* Ensure all queued register saves are flushed before the + call. */ + if (dwarf2out_do_frame ()) + { + rtx insn; + start_sequence (); + insn = emit_barrier (); + end_sequence (); + dwarf2out_frame_debug (insn, false); + } +#endif xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name)); xops[2] = gen_rtx_MEM (QImode, xops[2]); output_asm_insn ("call\t%X2", xops); @@ -7919,6 +8127,7 @@ && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs) { int count = frame->nregs; + struct cgraph_node *node = cgraph_node (current_function_decl); cfun->machine->use_fast_prologue_epilogue_nregs = count; /* The fast prologue uses move instead of push to save registers. This @@ -7933,9 +8142,9 @@ slow to use many of them. */ if (count) count = (count - 1) * FAST_PROLOGUE_INSN_COUNT; - if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL + if (node->frequency < NODE_FREQUENCY_NORMAL || (flag_branch_probabilities - && cfun->function_frequency < FUNCTION_FREQUENCY_HOT)) + && node->frequency < NODE_FREQUENCY_HOT)) cfun->machine->use_fast_prologue_epilogue = false; else cfun->machine->use_fast_prologue_epilogue @@ -8237,6 +8446,8 @@ passing. */ if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2 && !lookup_attribute ("fastcall", + TYPE_ATTRIBUTES (TREE_TYPE (decl))) + && !lookup_attribute ("thiscall", TYPE_ATTRIBUTES (TREE_TYPE (decl)))) return CX_REG; else @@ -8331,7 +8542,11 @@ end_sequence (); insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ())); - RTX_FRAME_RELATED_P (insn) = 1; + if (!optimize) + { + add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg); + RTX_FRAME_RELATED_P (insn) = 1; + } return drap_vreg; } else @@ -8559,13 +8774,10 @@ ix86_cfa_state->reg == stack_pointer_rtx); else { - /* Only valid for Win32. */ rtx eax = gen_rtx_REG (Pmode, AX_REG); bool eax_live; rtx t; - gcc_assert (!TARGET_64BIT || cfun->machine->call_abi == MS_ABI); - if (cfun->machine->call_abi == MS_ABI) eax_live = false; else @@ -9263,6 +9475,7 @@ rtx base_reg, index_reg; HOST_WIDE_INT scale = 1; rtx scale_rtx = NULL_RTX; + rtx tmp; int retval = 1; enum ix86_address_seg seg = SEG_DEFAULT; @@ -9298,6 +9511,19 @@ scale_rtx = XEXP (op, 1); break; + case ASHIFT: + if (index) + return 0; + index = XEXP (op, 0); + tmp = XEXP (op, 1); + if (!CONST_INT_P (tmp)) + return 0; + scale = INTVAL (tmp); + if ((unsigned HOST_WIDE_INT) scale > 3) + return 0; + scale = 1 << scale; + break; + case UNSPEC: if (XINT (op, 1) == UNSPEC_TP && TARGET_TLS_DIRECT_SEG_REFS @@ -9338,8 +9564,6 @@ } else if (GET_CODE (addr) == ASHIFT) { - rtx tmp; - /* We're called for lea too, which implements ashift on occasion. */ index = XEXP (addr, 0); tmp = XEXP (addr, 1); @@ -10794,29 +11018,29 @@ break; case UNSPEC_GOTTPOFF: /* FIXME: This might be @TPOFF in Sun ld too. */ - fputs ("@GOTTPOFF", file); + fputs ("@gottpoff", file); break; case UNSPEC_TPOFF: - fputs ("@TPOFF", file); + fputs ("@tpoff", file); break; case UNSPEC_NTPOFF: if (TARGET_64BIT) - fputs ("@TPOFF", file); - else - fputs ("@NTPOFF", file); + fputs ("@tpoff", file); + else + fputs ("@ntpoff", file); break; case UNSPEC_DTPOFF: - fputs ("@DTPOFF", file); + fputs ("@dtpoff", file); break; case UNSPEC_GOTNTPOFF: if (TARGET_64BIT) fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "@GOTTPOFF(%rip)": "@GOTTPOFF[rip]", file); - else - fputs ("@GOTNTPOFF", file); + "@gottpoff(%rip)": "@gottpoff[rip]", file); + else + fputs ("@gotntpoff", file); break; case UNSPEC_INDNTPOFF: - fputs ("@INDNTPOFF", file); + fputs ("@indntpoff", file); break; #if TARGET_MACHO case UNSPEC_MACHOPIC_OFFSET: @@ -10843,7 +11067,7 @@ { fputs (ASM_LONG, file); output_addr_const (file, x); - fputs ("@DTPOFF", file); + fputs ("@dtpoff", file); switch (size) { case 4: @@ -10884,6 +11108,9 @@ ix86_delegitimize_address (rtx x) { rtx orig_x = delegitimize_mem_from_attrs (x); + /* addend is NULL or some rtx if x is something+GOTOFF where + something doesn't include the PIC register. */ + rtx addend = NULL_RTX; /* reg_addend is NULL or a multiple of some register. */ rtx reg_addend = NULL_RTX; /* const_addend is NULL or a const_int. */ @@ -10903,7 +11130,10 @@ || XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL || !MEM_P (orig_x)) return orig_x; - return XVECEXP (XEXP (x, 0), 0, 0); + x = XVECEXP (XEXP (x, 0), 0, 0); + if (GET_MODE (orig_x) != Pmode) + return simplify_gen_subreg (GET_MODE (orig_x), x, Pmode, 0); + return x; } if (GET_CODE (x) != PLUS @@ -10922,14 +11152,13 @@ else if (ix86_pic_register_p (XEXP (reg_addend, 1))) reg_addend = XEXP (reg_addend, 0); else - return orig_x; - if (!REG_P (reg_addend) - && GET_CODE (reg_addend) != MULT - && GET_CODE (reg_addend) != ASHIFT) - return orig_x; - } - else - return orig_x; + { + reg_addend = NULL_RTX; + addend = XEXP (x, 0); + } + } + else + addend = XEXP (x, 0); x = XEXP (XEXP (x, 1), 0); if (GET_CODE (x) == PLUS @@ -10940,7 +11169,7 @@ } if (GET_CODE (x) == UNSPEC - && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x)) + && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend) || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)))) result = XVECEXP (x, 0, 0); @@ -10955,6 +11184,24 @@ result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend)); if (reg_addend) result = gen_rtx_PLUS (Pmode, reg_addend, result); + if (addend) + { + /* If the rest of original X doesn't involve the PIC register, add + addend and subtract pic_offset_table_rtx. This can happen e.g. + for code like: + leal (%ebx, %ecx, 4), %ecx + ... + movl foo@GOTOFF(%ecx), %edx + in which case we return (%ecx - %ebx) + foo. */ + if (pic_offset_table_rtx) + result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend), + pic_offset_table_rtx), + result); + else + return orig_x; + } + if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x)) + return simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0); return result; } @@ -11295,7 +11542,7 @@ return cfun->machine->some_ld_name; for (insn = get_insns (); insn ; insn = NEXT_INSN (insn)) - if (INSN_P (insn) + if (NONDEBUG_INSN_P (insn) && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0)) return cfun->machine->some_ld_name; @@ -11306,7 +11553,6 @@ L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. C -- print opcode suffix for set/cmov insn. c -- like C, but print reversed condition - E,e -- likewise, but for compare-and-branch fused insn. F,f -- likewise, but for floating-point. O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.", otherwise nothing @@ -11711,14 +11957,6 @@ put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file); return; - case 'E': - put_condition_code (GET_CODE (x), CCmode, 0, 0, file); - return; - - case 'e': - put_condition_code (GET_CODE (x), CCmode, 1, 0, file); - return; - case 'H': /* It doesn't actually matter what mode we use here, as we're only going to use this for printing. */ @@ -11817,10 +12055,8 @@ return; case ';': -#if TARGET_MACHO - fputs (" ; ", file); -#else - putc (' ', file); +#if TARGET_MACHO || !HAVE_AS_IX86_REP_LOCK_PREFIX + fputs (";", file); #endif return; @@ -12102,34 +12338,34 @@ case UNSPEC_GOTTPOFF: output_addr_const (file, op); /* FIXME: This might be @TPOFF in Sun ld. */ - fputs ("@GOTTPOFF", file); + fputs ("@gottpoff", file); break; case UNSPEC_TPOFF: output_addr_const (file, op); - fputs ("@TPOFF", file); + fputs ("@tpoff", file); break; case UNSPEC_NTPOFF: output_addr_const (file, op); if (TARGET_64BIT) - fputs ("@TPOFF", file); - else - fputs ("@NTPOFF", file); + fputs ("@tpoff", file); + else + fputs ("@ntpoff", file); break; case UNSPEC_DTPOFF: output_addr_const (file, op); - fputs ("@DTPOFF", file); + fputs ("@dtpoff", file); break; case UNSPEC_GOTNTPOFF: output_addr_const (file, op); if (TARGET_64BIT) fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "@GOTTPOFF(%rip)" : "@GOTTPOFF[rip]", file); - else - fputs ("@GOTNTPOFF", file); + "@gottpoff(%rip)" : "@gottpoff[rip]", file); + else + fputs ("@gotntpoff", file); break; case UNSPEC_INDNTPOFF: output_addr_const (file, op); - fputs ("@INDNTPOFF", file); + fputs ("@indntpoff", file); break; #if TARGET_MACHO case UNSPEC_MACHOPIC_OFFSET: @@ -13107,6 +13343,14 @@ switch (GET_MODE_SIZE (mode)) { case 16: + /* If we're optimizing for size, movups is the smallest. */ + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_avx_movups (op0, op1)); + return; + } op0 = gen_lowpart (V16QImode, op0); op1 = gen_lowpart (V16QImode, op1); emit_insn (gen_avx_movdqu (op0, op1)); @@ -13133,6 +13377,13 @@ emit_insn (gen_avx_movups256 (op0, op1)); break; case V2DFmode: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_avx_movups (op0, op1)); + return; + } emit_insn (gen_avx_movupd (op0, op1)); break; case V4DFmode: @@ -13153,7 +13404,8 @@ if (MEM_P (op1)) { /* If we're optimizing for size, movups is the smallest. */ - if (optimize_insn_for_size_p ()) + if (optimize_insn_for_size_p () + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) { op0 = gen_lowpart (V4SFmode, op0); op1 = gen_lowpart (V4SFmode, op1); @@ -13176,13 +13428,13 @@ { rtx zero; - if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL) - { - op0 = gen_lowpart (V2DFmode, op0); - op1 = gen_lowpart (V2DFmode, op1); - emit_insn (gen_sse2_movupd (op0, op1)); - return; - } + if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL) + { + op0 = gen_lowpart (V2DFmode, op0); + op1 = gen_lowpart (V2DFmode, op1); + emit_insn (gen_sse2_movupd (op0, op1)); + return; + } /* When SSE registers are split into halves, we can avoid writing to the top half twice. */ @@ -13211,12 +13463,12 @@ } else { - if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL) - { - op0 = gen_lowpart (V4SFmode, op0); - op1 = gen_lowpart (V4SFmode, op1); - emit_insn (gen_sse_movups (op0, op1)); - return; + if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_sse_movups (op0, op1)); + return; } if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) @@ -13235,7 +13487,8 @@ else if (MEM_P (op0)) { /* If we're optimizing for size, movups is the smallest. */ - if (optimize_insn_for_size_p ()) + if (optimize_insn_for_size_p () + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) { op0 = gen_lowpart (V4SFmode, op0); op1 = gen_lowpart (V4SFmode, op1); @@ -13256,19 +13509,37 @@ if (TARGET_SSE2 && mode == V2DFmode) { - m = adjust_address (op0, DFmode, 0); - emit_insn (gen_sse2_storelpd (m, op1)); - m = adjust_address (op0, DFmode, 8); - emit_insn (gen_sse2_storehpd (m, op1)); + if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL) + { + op0 = gen_lowpart (V2DFmode, op0); + op1 = gen_lowpart (V2DFmode, op1); + emit_insn (gen_sse2_movupd (op0, op1)); + } + else + { + m = adjust_address (op0, DFmode, 0); + emit_insn (gen_sse2_storelpd (m, op1)); + m = adjust_address (op0, DFmode, 8); + emit_insn (gen_sse2_storehpd (m, op1)); + } } else { if (mode != V4SFmode) op1 = gen_lowpart (V4SFmode, op1); - m = adjust_address (op0, V2SFmode, 0); - emit_insn (gen_sse_storelps (m, op1)); - m = adjust_address (op0, V2SFmode, 8); - emit_insn (gen_sse_storehps (m, op1)); + + if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + emit_insn (gen_sse_movups (op0, op1)); + } + else + { + m = adjust_address (op0, V2SFmode, 0); + emit_insn (gen_sse_storelps (m, op1)); + m = adjust_address (op0, V2SFmode, 8); + emit_insn (gen_sse_storehps (m, op1)); + } } } else @@ -13387,16 +13658,6 @@ if (MEM_P (src1) && !rtx_equal_p (dst, src1)) src1 = force_reg (mode, src1); - /* In order for the multiply-add patterns to get matched, we need - to aid combine by forcing all operands into registers to start. */ - if (optimize && TARGET_FMA4) - { - if (MEM_P (src2)) - src2 = force_reg (GET_MODE (src2), src2); - else if (MEM_P (src1)) - src1 = force_reg (GET_MODE (src1), src1); - } - operands[1] = src1; operands[2] = src2; return dst; @@ -13560,7 +13821,7 @@ rtx prev = PREV_INSN (insn); while (prev && distance < LEA_SEARCH_THRESHOLD) { - if (INSN_P (prev)) + if (NONDEBUG_INSN_P (prev)) { distance++; for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++) @@ -13600,7 +13861,7 @@ && prev != insn && distance < LEA_SEARCH_THRESHOLD) { - if (INSN_P (prev)) + if (NONDEBUG_INSN_P (prev)) { distance++; for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++) @@ -13646,7 +13907,7 @@ rtx next = NEXT_INSN (insn); while (next && distance < LEA_SEARCH_THRESHOLD) { - if (INSN_P (next)) + if (NONDEBUG_INSN_P (next)) { distance++; @@ -13695,7 +13956,7 @@ && next != insn && distance < LEA_SEARCH_THRESHOLD) { - if (INSN_P (next)) + if (NONDEBUG_INSN_P (next)) { distance++; @@ -15391,7 +15652,7 @@ enum rtx_code code = GET_CODE (operands[1]), compare_code; rtx compare_seq, compare_op; enum machine_mode mode = GET_MODE (operands[0]); - bool sign_bit_compare_p = false;; + bool sign_bit_compare_p = false; start_sequence (); ix86_compare_op0 = XEXP (operands[1], 0); @@ -15432,7 +15693,6 @@ if (!sign_bit_compare_p) { rtx flags; - rtx (*insn)(rtx, rtx, rtx); bool fpcmp = false; compare_code = GET_CODE (compare_op); @@ -15473,11 +15733,10 @@ tmp = gen_reg_rtx (mode); if (mode == DImode) - insn = gen_x86_movdicc_0_m1; + emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op)); else - insn = gen_x86_movsicc_0_m1; - - emit_insn (insn (tmp, flags, compare_op)); + emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), + flags, compare_op)); } else { @@ -16256,29 +16515,22 @@ case V2DImode: { rtx t1, t2, mask; - - /* Perform a parallel modulo subtraction. */ - t1 = gen_reg_rtx (mode); - emit_insn ((mode == V4SImode - ? gen_subv4si3 - : gen_subv2di3) (t1, cop0, cop1)); - - /* Extract the original sign bit of op0. */ + rtx (*gen_sub3) (rtx, rtx, rtx); + + /* Subtract (-(INT MAX) - 1) from both operands to make + them signed. */ mask = ix86_build_signbit_mask (GET_MODE_INNER (mode), true, false); + gen_sub3 = (mode == V4SImode + ? gen_subv4si3 : gen_subv2di3); + t1 = gen_reg_rtx (mode); + emit_insn (gen_sub3 (t1, cop0, mask)); + t2 = gen_reg_rtx (mode); - emit_insn ((mode == V4SImode - ? gen_andv4si3 - : gen_andv2di3) (t2, cop0, mask)); - - /* XOR it back into the result of the subtraction. - This results in the sign bit set iff we saw - unsigned underflow. */ - x = gen_reg_rtx (mode); - emit_insn ((mode == V4SImode - ? gen_xorv4si3 - : gen_xorv2di3) (x, t1, t2)); - + emit_insn (gen_sub3 (t2, cop1, mask)); + + cop0 = t1; + cop1 = t2; code = GT; } break; @@ -16290,6 +16542,8 @@ emit_insn (gen_rtx_SET (VOIDmode, x, gen_rtx_US_MINUS (mode, cop0, cop1))); + cop0 = x; + cop1 = CONST0_RTX (mode); code = EQ; negate = !negate; break; @@ -16297,9 +16551,6 @@ default: gcc_unreachable (); } - - cop0 = x; - cop1 = CONST0_RTX (mode); } } @@ -17023,20 +17274,22 @@ : gen_x86_64_shld) (high[0], low[0], operands[2])); } - emit_insn ((mode == DImode ? gen_ashlsi3 : gen_ashldi3) (low[0], low[0], operands[2])); + emit_insn ((mode == DImode + ? gen_ashlsi3 + : gen_ashldi3) (low[0], low[0], operands[2])); if (TARGET_CMOVE && scratch) { ix86_expand_clear (scratch); emit_insn ((mode == DImode - ? gen_x86_shift_adj_1 - : gen_x86_64_shift_adj_1) (high[0], low[0], operands[2], - scratch)); + ? gen_x86_shiftsi_adj_1 + : gen_x86_shiftdi_adj_1) (high[0], low[0], operands[2], + scratch)); } else emit_insn ((mode == DImode - ? gen_x86_shift_adj_2 - : gen_x86_64_shift_adj_2) (high[0], low[0], operands[2])); + ? gen_x86_shiftsi_adj_2 + : gen_x86_shiftdi_adj_2) (high[0], low[0], operands[2])); } void @@ -17109,14 +17362,14 @@ : gen_ashrdi3) (scratch, scratch, GEN_INT (single_width - 1))); emit_insn ((mode == DImode - ? gen_x86_shift_adj_1 - : gen_x86_64_shift_adj_1) (low[0], high[0], operands[2], - scratch)); + ? gen_x86_shiftsi_adj_1 + : gen_x86_shiftdi_adj_1) (low[0], high[0], operands[2], + scratch)); } else emit_insn ((mode == DImode - ? gen_x86_shift_adj_3 - : gen_x86_64_shift_adj_3) (low[0], high[0], operands[2])); + ? gen_x86_shiftsi_adj_3 + : gen_x86_shiftdi_adj_3) (low[0], high[0], operands[2])); } } @@ -17174,14 +17427,14 @@ { ix86_expand_clear (scratch); emit_insn ((mode == DImode - ? gen_x86_shift_adj_1 - : gen_x86_64_shift_adj_1) (low[0], high[0], operands[2], - scratch)); + ? gen_x86_shiftsi_adj_1 + : gen_x86_shiftdi_adj_1) (low[0], high[0], operands[2], + scratch)); } else emit_insn ((mode == DImode - ? gen_x86_shift_adj_2 - : gen_x86_64_shift_adj_2) (low[0], high[0], operands[2])); + ? gen_x86_shiftsi_adj_2 + : gen_x86_shiftdi_adj_2) (low[0], high[0], operands[2])); } } @@ -19606,6 +19859,7 @@ case PROCESSOR_NOCONA: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: + case PROCESSOR_BDVER1: return 3; case PROCESSOR_CORE2: @@ -19795,6 +20049,7 @@ case PROCESSOR_ATHLON: case PROCESSOR_K8: case PROCESSOR_AMDFAM10: + case PROCESSOR_BDVER1: case PROCESSOR_ATOM: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: @@ -19990,10 +20245,26 @@ } /* x86-64 ABI requires arrays greater than 16 bytes to be aligned - to 16byte boundary. */ - if (TARGET_64BIT) + to 16byte boundary. Exact wording is: + + An array uses the same alignment as its elements, except that a local or + global array variable of length at least 16 bytes or + a C99 variable-length array variable always has alignment of at least 16 bytes. + + This was added to allow use of aligned SSE instructions at arrays. This + rule is meant for static storage (where compiler can not do the analysis + by itself). We follow it for automatic variables only when convenient. + We fully control everything in the function compiled and functions from + other unit can not rely on the alignment. + + Exclude va_list type. It is the common case of local array where + we can not benefit from the alignment. */ + if (TARGET_64BIT && optimize_function_for_speed_p (cfun) + && TARGET_SSE) { if (AGGREGATE_TYPE_P (type) + && (TYPE_MAIN_VARIANT (type) + != TYPE_MAIN_VARIANT (va_list_type_node)) && TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16 @@ -20101,6 +20372,12 @@ us with EAX for the static chain. */ regno = AX_REG; } + else if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype))) + { + /* Thiscall functions use ecx for arguments, which leaves + us with EAX for the static chain. */ + regno = AX_REG; + } else if (ix86_function_regparm (fntype, fndecl) == 3) { /* For regparm 3, we have no free call-clobbered registers in @@ -20978,6 +21255,10 @@ IX86_BUILTIN_VPERMILPS, IX86_BUILTIN_VPERMILPD256, IX86_BUILTIN_VPERMILPS256, + IX86_BUILTIN_VPERMIL2PD, + IX86_BUILTIN_VPERMIL2PS, + IX86_BUILTIN_VPERMIL2PD256, + IX86_BUILTIN_VPERMIL2PS256, IX86_BUILTIN_VPERM2F128PD256, IX86_BUILTIN_VPERM2F128PS256, IX86_BUILTIN_VPERM2F128SI256, @@ -22167,6 +22448,10 @@ }; /* FMA4 and XOP. */ +#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT +#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT +#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT +#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF @@ -22409,6 +22694,11 @@ { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF }, { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 }, + }; /* Set up all the MMX/SSE builtins, even builtins for instructions that are not @@ -22789,6 +23079,14 @@ switch (m_type) { + case MULTI_ARG_4_DF2_DI_I: + case MULTI_ARG_4_DF2_DI_I1: + case MULTI_ARG_4_SF2_SI_I: + case MULTI_ARG_4_SF2_SI_I1: + nargs = 4; + last_arg_constant = true; + break; + case MULTI_ARG_3_SF: case MULTI_ARG_3_DF: case MULTI_ARG_3_SF2: @@ -22932,6 +23230,10 @@ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); break; + case 4: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); + break; + default: gcc_unreachable (); } @@ -23550,6 +23852,13 @@ nargs = 3; nargs_constant = 2; break; + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: + case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: + case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: + nargs = 4; + nargs_constant = 1; + break; case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: nargs = 4; nargs_constant = 2; @@ -23619,6 +23928,10 @@ case CODE_FOR_sse4_1_blendpd: case CODE_FOR_avx_vpermilv2df: + case CODE_FOR_xop_vpermil2v2df3: + case CODE_FOR_xop_vpermil2v4sf3: + case CODE_FOR_xop_vpermil2v4df3: + case CODE_FOR_xop_vpermil2v8sf3: error ("the last argument must be a 2-bit immediate"); return const0_rtx; @@ -24275,14 +24588,16 @@ if it is not available. */ static tree -ix86_builtin_vectorized_function (unsigned int fn, tree type_out, +ix86_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in) { enum machine_mode in_mode, out_mode; int in_n, out_n; + enum built_in_function fn = DECL_FUNCTION_CODE (fndecl); if (TREE_CODE (type_out) != VECTOR_TYPE - || TREE_CODE (type_in) != VECTOR_TYPE) + || TREE_CODE (type_in) != VECTOR_TYPE + || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL) return NULL_TREE; out_mode = TYPE_MODE (TREE_TYPE (type_out)); @@ -24540,43 +24855,92 @@ /* Returns a decl of a function that implements conversion of an integer vector - into a floating-point vector, or vice-versa. TYPE is the type of the integer - side of the conversion. + into a floating-point vector, or vice-versa. DEST_TYPE and SRC_TYPE + are the types involved when converting according to CODE. Return NULL_TREE if it is not available. */ static tree -ix86_vectorize_builtin_conversion (unsigned int code, tree type) -{ - if (! (TARGET_SSE2 && TREE_CODE (type) == VECTOR_TYPE)) +ix86_vectorize_builtin_conversion (unsigned int code, + tree dest_type, tree src_type) +{ + if (! TARGET_SSE2) return NULL_TREE; switch (code) { case FLOAT_EXPR: - switch (TYPE_MODE (type)) + switch (TYPE_MODE (src_type)) { case V4SImode: - return TYPE_UNSIGNED (type) - ? ix86_builtins[IX86_BUILTIN_CVTUDQ2PS] - : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]; + switch (TYPE_MODE (dest_type)) + { + case V4SFmode: + return (TYPE_UNSIGNED (src_type) + ? ix86_builtins[IX86_BUILTIN_CVTUDQ2PS] + : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]); + case V4DFmode: + return (TYPE_UNSIGNED (src_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTDQ2PD256]); + default: + return NULL_TREE; + } + break; + case V8SImode: + switch (TYPE_MODE (dest_type)) + { + case V8SFmode: + return (TYPE_UNSIGNED (src_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]); + default: + return NULL_TREE; + } + break; default: return NULL_TREE; } case FIX_TRUNC_EXPR: - switch (TYPE_MODE (type)) + switch (TYPE_MODE (dest_type)) { case V4SImode: - return TYPE_UNSIGNED (type) - ? NULL_TREE - : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]; + switch (TYPE_MODE (src_type)) + { + case V4SFmode: + return (TYPE_UNSIGNED (dest_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]); + case V4DFmode: + return (TYPE_UNSIGNED (dest_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTTPD2DQ256]); + default: + return NULL_TREE; + } + break; + + case V8SImode: + switch (TYPE_MODE (src_type)) + { + case V8SFmode: + return (TYPE_UNSIGNED (dest_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ256]); + default: + return NULL_TREE; + } + break; + default: return NULL_TREE; } + default: return NULL_TREE; - - } + } + + return NULL_TREE; } /* Returns a code for a target-specific builtin that implements @@ -24640,7 +25004,7 @@ if (!CONST_INT_P (er)) return 0; ei = INTVAL (er); - if (ei >= 2 * nelt) + if (ei >= nelt) return 0; ipar[i] = ei; } @@ -25835,13 +26199,6 @@ fprintf (file, "\t.indirect_symbol %s\n", symbol_name); fprintf (file, ASM_LONG "%s\n", binder_name); } - -void -darwin_x86_file_end (void) -{ - darwin_file_end (); - ix86_file_end (); -} #endif /* TARGET_MACHO */ /* Order the registers for register allocator. */ @@ -26037,6 +26394,13 @@ if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) regno = aggr ? DX_REG : CX_REG; + else if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type))) + { + regno = CX_REG; + if (aggr) + return gen_rtx_MEM (SImode, + plus_constant (stack_pointer_rtx, 4)); + } else { regno = AX_REG; @@ -26088,7 +26452,7 @@ *(*this + vcall_offset) should be added to THIS. */ static void -x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, +x86_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta, HOST_WIDE_INT vcall_offset, tree function) { @@ -26096,6 +26460,9 @@ rtx this_param = x86_this_parameter (function); rtx this_reg, tmp; + /* Make sure unwind info is emitted for the thunk if needed. */ + final_start_function (emit_barrier (), file, 1); + /* If VCALL_OFFSET, we'll need THIS in a register. Might as well pull it in now and let DELTA benefit. */ if (REG_P (this_param)) @@ -26113,10 +26480,7 @@ /* Adjust the this parameter by a fixed constant. */ if (delta) { - /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'. - Exceptions: -128 encodes smaller than 128, so swap sign and op. */ - bool sub = delta < 0 || delta == 128; - xops[0] = GEN_INT (sub ? -delta : delta); + xops[0] = GEN_INT (delta); xops[1] = this_reg ? this_reg : this_param; if (TARGET_64BIT) { @@ -26128,12 +26492,12 @@ xops[0] = tmp; xops[1] = this_param; } - if (sub) + if (x86_maybe_negate_const_int (&xops[0], DImode)) output_asm_insn ("sub{q}\t{%0, %1|%1, %0}", xops); else output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops); } - else if (sub) + else if (x86_maybe_negate_const_int (&xops[0], SImode)) output_asm_insn ("sub{l}\t{%0, %1|%1, %0}", xops); else output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops); @@ -26148,7 +26512,9 @@ { int tmp_regno = CX_REG; if (lookup_attribute ("fastcall", - TYPE_ATTRIBUTES (TREE_TYPE (function)))) + TYPE_ATTRIBUTES (TREE_TYPE (function))) + || lookup_attribute ("thiscall", + TYPE_ATTRIBUTES (TREE_TYPE (function)))) tmp_regno = AX_REG; tmp = gen_rtx_REG (SImode, tmp_regno); } @@ -26224,6 +26590,7 @@ output_asm_insn ("jmp\t{*}%1", xops); } } + final_end_function (); } static void @@ -26265,7 +26632,7 @@ if (TARGET_64BIT) { #ifndef NO_PROFILE_COUNTERS - fprintf (file, "\tleaq\t" LPREFIX "P%d@(%%rip),%%r11\n", labelno); + fprintf (file, "\tleaq\t" LPREFIX "P%d(%%rip),%%r11\n", labelno); #endif if (DEFAULT_ABI == SYSV_ABI && flag_pic) @@ -26497,7 +26864,7 @@ replace = true; /* Empty functions get branch mispredict even when the jump destination is not visible to us. */ - if (!prev && cfun->function_frequency > FUNCTION_FREQUENCY_UNLIKELY_EXECUTED) + if (!prev && !optimize_function_for_size_p (cfun)) replace = true; } if (replace) @@ -26559,6 +26926,52 @@ extended_reg_mentioned_1, NULL); } +/* If profitable, negate (without causing overflow) integer constant + of mode MODE at location LOC. Return true in this case. */ +bool +x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode) +{ + HOST_WIDE_INT val; + + if (!CONST_INT_P (*loc)) + return false; + + switch (mode) + { + case DImode: + /* DImode x86_64 constants must fit in 32 bits. */ + gcc_assert (x86_64_immediate_operand (*loc, mode)); + + mode = SImode; + break; + + case SImode: + case HImode: + case QImode: + break; + + default: + gcc_unreachable (); + } + + /* Avoid overflows. */ + if (mode_signbit_p (mode, *loc)) + return false; + + val = INTVAL (*loc); + + /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'. + Exceptions: -128 encodes smaller than 128, so swap sign and op. */ + if ((val < 0 && val != -128) + || val == 128) + { + *loc = GEN_INT (-val); + return true; + } + + return false; +} + /* Generate an unsigned DImode/SImode to FP conversion. This is the same code optabs would emit if we didn't have TFmode patterns. */ @@ -26662,8 +27075,16 @@ insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup)); if (recog_memoized (insn) < 0) { + rtx seq; /* If that fails, force VAL into a register. */ + + start_sequence (); XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val); + seq = get_insns (); + end_sequence (); + if (seq) + emit_insn_before (seq, insn); + ok = recog_memoized (insn) >= 0; gcc_assert (ok); } @@ -28833,6 +29254,9 @@ /* Fastcall attribute says callee is responsible for popping arguments if they are not variable. */ { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* Thiscall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute }, /* Cdecl attribute says the callee is a normal C declaration */ { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute }, /* Regparm attribute specifies how many integer arguments are to be @@ -28894,7 +29318,7 @@ tree itype = TREE_TYPE (vec_type); bool u = TYPE_UNSIGNED (itype); enum machine_mode vmode = TYPE_MODE (vec_type); - enum ix86_builtins fcode; + enum ix86_builtins fcode = fcode; /* Silence bogus warning. */ bool ok = TARGET_SSE2; switch (vmode) @@ -29103,8 +29527,8 @@ do_subreg: vmode = V8HImode; target = gen_lowpart (vmode, target); - op0 = gen_lowpart (vmode, target); - op1 = gen_lowpart (vmode, target); + op0 = gen_lowpart (vmode, op0); + op1 = gen_lowpart (vmode, op1); break; default: @@ -29112,7 +29536,7 @@ } /* This matches five different patterns with the different modes. */ - x = gen_rtx_VEC_MERGE (vmode, op0, op1, GEN_INT (mask)); + x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask)); x = gen_rtx_SET (VOIDmode, target, x); emit_insn (x); @@ -29224,7 +29648,12 @@ input where SEL+CONCAT may not. */ if (d->op0 == d->op1) { - if (expand_vselect (d->target, d->op0, d->perm, nelt)) + int mask = nelt - 1; + + for (i = 0; i < nelt; i++) + perm2[i] = d->perm[i] & mask; + + if (expand_vselect (d->target, d->op0, perm2, nelt)) return true; /* There are plenty of patterns in sse.md that are written for @@ -29235,8 +29664,8 @@ every other permutation operand. */ for (i = 0; i < nelt; i += 2) { - perm2[i] = d->perm[i]; - perm2[i+1] = d->perm[i+1] + nelt; + perm2[i] = d->perm[i] & mask; + perm2[i + 1] = (d->perm[i + 1] & mask) + nelt; } if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt)) return true; @@ -29244,11 +29673,12 @@ /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */ if (nelt >= 4) { - memcpy (perm2, d->perm, nelt); - for (i = 2; i < nelt; i += 4) - { - perm2[i+0] += nelt; - perm2[i+1] += nelt; + for (i = 0; i < nelt; i += 4) + { + perm2[i + 0] = d->perm[i + 0] & mask; + perm2[i + 1] = d->perm[i + 1] & mask; + perm2[i + 2] = (d->perm[i + 2] & mask) + nelt; + perm2[i + 3] = (d->perm[i + 3] & mask) + nelt; } if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt)) @@ -30383,6 +30813,9 @@ #undef TARGET_FUNCTION_VALUE #define TARGET_FUNCTION_VALUE ix86_function_value +#undef TARGET_FUNCTION_VALUE_REGNO_P +#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p + #undef TARGET_SECONDARY_RELOAD #define TARGET_SECONDARY_RELOAD ix86_secondary_reload @@ -30429,6 +30862,9 @@ #undef TARGET_CAN_ELIMINATE #define TARGET_CAN_ELIMINATE ix86_can_eliminate +#undef TARGET_ASM_CODE_END +#define TARGET_ASM_CODE_END ix86_code_end + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-i386.h"