comparison: gcc/config/i386/x86-tune.def @ 131:84e7813d76e9 (gcc-8.2)
author:     mir3636
date:       Thu, 25 Oct 2018 07:37:49 +0900
parents:    04ced10e8804
children:   1830386684a0
--- gcc/config/i386/x86-tune.def        111:04ced10e8804
+++ gcc/config/i386/x86-tune.def        131:84e7813d76e9
@@ -1,7 +1,7 @@
 /* Definitions of x86 tunable features.
-   Copyright (C) 2013-2017 Free Software Foundation, Inc.
+   Copyright (C) 2013-2018 Free Software Foundation, Inc.
 
    This file is part of GCC.
 
    GCC is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
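A note on how the DEF_TUNE entries below are consumed: x86-tune.def is textually included several times by the i386 backend with different definitions of the DEF_TUNE macro, once to build an enum of feature indices and once to build the name/mask table behind -mtune-ctrl=. A minimal sketch of the enum-building inclusion, assuming the conventional GCC pattern (the consumer code lives in i386.h/i386.c and is not part of this diff):

    /* Sketch: each DEF_TUNE (tune, name, selector) contributes one
       enumerator; a second inclusion elsewhere records the "name" string
       and the selector mask of CPU models that enable the feature.  */
    #define DEF_TUNE(tune, name, selector) tune,
    enum ix86_tune_indices {
    #include "x86-tune.def"
      X86_TUNE_LAST
    };
    #undef DEF_TUNE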
@@ -39,19 +39,22 @@
 /*****************************************************************************/
 
 /* X86_TUNE_SCHEDULE: Enable scheduling.  */
 DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
           m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
-          | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
+          | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_GOLDMONT
+          | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC)
 
 /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
    on modern chips.  Prefer stores affecting the whole integer register
    over partial stores.  For example prefer MOVZBL or MOVQ to load an
    8bit value over MOVB.  */
 DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
-          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
-          | m_KNL | m_KNM | m_AMD_MULTIPLE | m_GENERIC)
+          m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2
+          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
+          | m_KNL | m_KNM | m_AMD_MULTIPLE | m_TREMONT
+          | m_GENERIC)
 
 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
    destinations to be 128bit to allow register renaming on 128bit SSE units,
    but usually results in one extra microop on 64bit SSE units.
    Experimental results show that disabling this option on P4 brings over 20%
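The MOVZBL-over-MOVB preference above is easiest to see on a byte load; a hypothetical illustration, not part of the patch:

    unsigned char first_byte (unsigned char *p)
    {
      /* With partial_reg_dependency set, GCC prefers
           movzbl (%rdi), %eax    # rewrites all of EAX: no merge needed
         over
           movb   (%rdi), %al     # writes only AL, so the result depends
                                  # on the previous contents of RAX.  */
      return *p;
    }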
@@ -72,52 +75,51 @@
    This is because Core2 resolves dependencies on the whole flags register
    and such sequences introduce a false dependency on the previous
    instruction setting the full flags.
 
    The flag does not affect generation of INC and DEC, which is controlled
-   by X86_TUNE_USE_INCDEC.
+   by X86_TUNE_USE_INCDEC.  */
 
-   This flag may be dropped from generic once core2-corei5 machines are
-   rare enough.  */
 DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
-          m_CORE2 | m_GENERIC)
+          m_CORE2)
 
 /* X86_TUNE_MOVX: Enable zero extension of integer registers to avoid
    partial dependencies.  */
 DEF_TUNE (X86_TUNE_MOVX, "movx",
-          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
-          | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC)
+          m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
+          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL
+          | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE
+          | m_CORE_AVX2 | m_TREMONT | m_GENERIC)
 
 /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
    full sized loads.  */
 DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
-          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
-          | m_KNL | m_KNM | m_AMD_MULTIPLE | m_GENERIC)
+          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
+          | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE
+          | m_TREMONT | m_GENERIC)
 
 /* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
-   conditional jump instruction for 32 bit TARGET.
-   FIXME: revisit for generic.  */
+   conditional jump instruction for 32 bit TARGET.  */
 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
-          m_CORE_ALL | m_BDVER | m_ZNVER1)
+          m_CORE_ALL | m_BDVER | m_ZNVER1 | m_GENERIC)
 
 /* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
-   conditional jump instruction for TARGET_64BIT.
-   FIXME: revisit for generic.  */
+   conditional jump instruction for TARGET_64BIT.  */
 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
-          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1)
+          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC)
 
 /* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
    subsequent conditional jump instruction when the conditional jump
    checks the sign flag (SF) or overflow flag (OF).  */
 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
-          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1)
+          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC)
 
 /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
    jump instruction when the alu instruction produces the CCFLAG consumed by
    the conditional jump instruction.  */
 DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
-          m_SANDYBRIDGE | m_HASWELL)
+          m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
 
 
 /*****************************************************************************/
 /* Function prologue, epilogue and function calling sequences.               */
 /*****************************************************************************/
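The fuse_cmp_and_branch flags describe macro-fusion: the decoder merges a compare (or, with fuse_alu_and_branch, an ALU op) and the adjacent conditional jump into one µop, so the scheduler keeps the pair adjacent. A sketch of the pattern being protected, hypothetical and not from the patch:

    long count_below (const long *p, long n, long limit)
    {
      long c = 0;
      for (long i = 0; i < n; i++)   /* the loop's cmp/jl pair is a
                                        fusion candidate on the listed
                                        cores */
        if (p[i] < limit)
          c++;
      return c;
    }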
@@ -126,19 +128,15 @@
    arguments in prologue/epilogue instead of separately for each call
    by push/pop instructions.
    This increases code size by about 5% in 32bit mode, less so in 64bit mode
    because parameters are passed in registers.  It is a considerable
    win for targets without a stack engine that would allow multiple push
-   operations to happen in parallel.
+   operations to happen in parallel.  */
 
-   FIXME: the flag is incorrectly enabled for amdfam10, Bulldozer,
-   Bobcat and Generic.  This is because disabling it causes large
-   regression on mgrid due to an IRA limitation leading to unnecessary
-   use of the frame pointer in 32bit mode.  */
 DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
           m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
-          | m_ATHLON_K8)
+          | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ATHLON_K8)
 
 /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
    considered on critical path.  */
 DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
           m_PPRO | m_ATHLON_K8)
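What accumulate_outgoing_args trades off, sketched on a 32-bit call sequence (the instruction selection shown is illustrative only):

    extern void f (int, int);
    void g (void)
    {
      /* Flag set: the prologue reserves the argument area once with
         "subl $N, %esp" and each call stores its arguments:
             movl  $2, 4(%esp)
             movl  $1, (%esp)
             call  f
         Flag clear: each call pushes and then pops:
             pushl $2
             pushl $1
             call  f
             addl  $8, %esp  */
      f (1, 2);
      f (1, 2);
    }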
@@ -190,17 +188,18 @@
    of conditional jump or directly preceded by other jump instruction.
    This is important for AMD K8-AMDFAM10 because the branch prediction
    architecture expects at most one jump per 2 byte window.  Failing to
    pad returns leads to misaligned return stack.  */
 DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
-          m_ATHLON_K8 | m_AMDFAM10 | m_GENERIC)
+          m_ATHLON_K8 | m_AMDFAM10)
 
 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
    than 4 branch instructions in the 16 byte window.  */
 DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
           m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM
-          | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)
+          | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL | m_ATHLON_K8
+          | m_AMDFAM10)
 
 /*****************************************************************************/
 /* Integer instruction selection tuning                                      */
 /*****************************************************************************/
 
@@ -216,29 +215,37 @@
 
 /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
    as "add mem, reg".  */
 DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))
 
-/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.  */
+/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.
+
+   Core2 and Nehalem have a 7 cycle stall on partial flag register
+   accesses.  Sandy Bridge and Ivy Bridge generate an extra uop.  On
+   Haswell this extra uop is output only when the values really need to
+   be merged, which GCC generated code does not do.  */
 DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
-          ~(m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
-          | m_KNL | m_KNM | m_GENERIC))
+          ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
+          | m_BONNELL | m_SILVERMONT | m_INTEL | m_KNL | m_KNM | m_GOLDMONT
+          | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC))
 
 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
    for DFmode copies.  */
 DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
-          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
-          | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
+          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
+          | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GOLDMONT
+          | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC))
 
 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.  This flag
    will impact LEA instruction selection.  */
 DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
-          | m_KNM | m_INTEL)
+          | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL)
 
 /* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
 DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
-          m_BONNELL | m_SILVERMONT | m_KNL | m_KNM)
+          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
+          | m_KNL | m_KNM)
 
 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and memory
    takes the vector path on AMD machines.
    FIXME: Do we need to enable this for core?  */
 DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
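The inc/dec trade-off above can be observed directly: -mtune-ctrl= takes exactly the quoted feature-string names from these DEF_TUNE entries (a real option, though intended for GCC developers). For example:

    /* gcc -O2 -mtune-ctrl=use_incdec    -> may emit "incl %eax"
       gcc -O2 -mtune-ctrl=^use_incdec   -> emits "addl $1, %eax" instead;
       inc/dec leave CF unmodified, while add rewrites all of EFLAGS and
       so cannot hit a partial-flag stall.  */
    int bump (int x) { return x + 1; }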
@@ -251,11 +258,12 @@
           m_K8 | m_AMDFAM10)
 
 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
    a conditional move.  */
 DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
-          m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL)
+          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL
+          | m_KNM | m_TREMONT | m_INTEL)
 
 /* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
    as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
 DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
 
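For avoid_mem_opnd_for_cmove the point is splitting the load out of the conditional move on Atom-class cores; a hypothetical illustration:

    int sel (int c, const int *p, int d)
    {
      /* Flag set: load first, then a register-register cmov:
             movl   (%rsi), %ecx
             testl  %edi, %edi
             cmovne %ecx, %eax
         Flag clear: the load may instead be folded into the cmov, as in
             cmovne (%rsi), %eax  */
      return c ? *p : d;
    }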
@@ -270,37 +278,40 @@
 
 /* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
 DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
           | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER
-          | m_BTVER | m_ZNVER1 | m_GENERIC)
+          | m_BTVER | m_ZNVER1 | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
+          | m_GENERIC)
 
 /* X86_TUNE_USE_CLTD: Controls use of the CLTD and CQTO instructions.  */
 DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
           ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
-          | m_K6))
+          | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT))
 
 /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
 DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
           m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
-          | m_LAKEMONT | m_AMD_MULTIPLE | m_GENERIC)
+          | m_LAKEMONT | m_AMD_MULTIPLE | m_GOLDMONT | m_GOLDMONT_PLUS
+          | m_TREMONT | m_GENERIC)
 
 /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
    for bit-manipulation instructions.  */
 DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
-          m_SANDYBRIDGE | m_HASWELL | m_GENERIC)
+          m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
 
 /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
    on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
    unrolling small loops less important.  For such architectures we adjust
    the unroll factor so that the unrolled loop fits the loop buffer.  */
 DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)
 
 /* X86_TUNE_ONE_IF_CONV_INSNS: Restrict the number of cmov insns in
    an if-converted sequence to one.  */
 DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
-          m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GENERIC)
+          m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GOLDMONT
+          | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC)
 
 /*****************************************************************************/
 /* 387 instruction selection tuning                                          */
 /*****************************************************************************/
 
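The CLTD/CQTO entry concerns the sign-extension idiom emitted ahead of signed division; a rough sketch of the two codegen choices (the flag-clear sequence is illustrative):

    int quot (int a, int b)
    {
      /* use_cltd set:              use_cltd clear (roughly):
             movl  %edi, %eax           movl %edi, %eax
             cltd                       movl %edi, %edx
             idivl %esi                 sarl $31, %edx
                                        idivl %esi
         Either way EDX:EAX must hold the sign-extended dividend.  */
      return a / b;
    }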
@@ -312,19 +323,21 @@
 
 /* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
    integer operand.  */
 DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
           ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
-          | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
+          | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE
+          | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC))
 
 /* X86_TUNE_USE_FFREEP: Use the FFREEP instruction instead of FSTP.  */
 DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
 
 /* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
 DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
-          | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC)
+          | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GOLDMONT
+          | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC)
 
 /*****************************************************************************/
 /* SSE instruction selection tuning                                          */
 /*****************************************************************************/
 
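The "fancy 80387 constants" are the dedicated x87 constant-load opcodes; a minimal illustration, assuming x87 math is actually in use (e.g. -m32 -mfpmath=387):

    double pi (void)
    {
      /* Flag set: "fldpi" materializes the constant in one instruction;
         otherwise it is loaded from the constant pool, e.g.
         "fldl .LC0".  */
      return 3.14159265358979323846;
    }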
@@ -334,18 +347,20 @@
           m_CORE_ALL)
 
 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
    of a sequence loading registers by parts.  */
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
-          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM
-          | m_INTEL | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER1 | m_GENERIC)
+          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
+          | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
+          | m_TREMONT | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER1 | m_GENERIC)
 
 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
    of a sequence storing registers by parts.  */
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
-          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM
-          | m_INTEL | m_BDVER | m_ZNVER1 | m_GENERIC)
+          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
+          | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
+          | m_TREMONT | m_BDVER | m_ZNVER1 | m_GENERIC)
 
 /* Use packed single precision instructions where possible, i.e. movups instead
    of movupd.  */
 DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
           m_BDVER | m_ZNVER1)
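The sse_unaligned_load_optimal choice, sketched; the split form shown is one common variant, illustrative rather than lifted from GCC output:

    #include <string.h>
    float sum4 (const char *bytes)       /* possibly misaligned buffer */
    {
      float v[4];
      memcpy (v, bytes, sizeof v);       /* flag set: one
                                            "movups (%rdi), %xmm0";
                                            flag clear: split halves, e.g.
                                            "movlps (%rdi), %xmm0" then
                                            "movhps 8(%rdi), %xmm0"  */
      return v[0] + v[1] + v[2] + v[3];
    }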
@@ -362,28 +377,28 @@
 
 /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
    to SSE registers.  If disabled, the moves will be done by storing
    the value to memory and reloading.  */
 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
-          ~(m_AMD_MULTIPLE | m_GENERIC))
+          ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC))
 
 /* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
    to integer registers.  If disabled, the moves will be done by storing
    the value to memory and reloading.  */
 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
           ~m_ATHLON_K8)
 
 /* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
-   to use both SSE and integer registers at the same time.
-   FIXME: revisit importance of this for generic.  */
+   to use both SSE and integer registers at the same time.  */
 DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
           ~(m_AMDFAM10 | m_BDVER))
 
 /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
    fp converts to destination register.  */
 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
-          m_SILVERMONT | m_KNL | m_KNM | m_INTEL)
+          m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS
+          | m_TREMONT | m_INTEL)
 
 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
    from FP to FP.  This form of instructions avoids partial write to the
    destination.  */
 DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
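The inter_unit_moves flags decide whether a GPR<->XMM transfer is a direct movd/movq or a bounce through memory; a small type-punning example where this shows up:

    #include <string.h>
    float bits_to_float (int x)
    {
      float f;
      /* Flag set:   movd %edi, %xmm0
         Flag clear: movl %edi, -4(%rsp); movss -4(%rsp), %xmm0  */
      memcpy (&f, &x, sizeof f);
      return f;
    }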
@@ -393,15 +408,24 @@
    from integer to FP.  */
 DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
 
 /* X86_TUNE_SLOW_PSHUFB: Indicates tunings with a slow pshufb instruction.  */
 DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
-          m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL)
+          m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
+          | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL)
 
 /* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes.  */
 DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
-          m_SILVERMONT | m_INTEL)
+          m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL)
+
+/* X86_TUNE_USE_GATHER: Use gather instructions.  */
+DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
+          ~(m_ZNVER1 | m_GENERIC))
+
+/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
+   smaller FMA chain.  */
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1)
 
 /*****************************************************************************/
 /* AVX instruction selection tuning (some of the SSE flags affect AVX, too) */
 /*****************************************************************************/
 
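The new use_gather entry is consulted by the vectorizer for indexed loads; with the mask above, gathers stay enabled everywhere except Znver1 and generic tuning. A loop that is a gather candidate under -mavx2:

    float dot_indexed (const float *x, const int *idx, int n)
    {
      float s = 0.0f;
      for (int i = 0; i < n; i++)
        s += x[idx[i]];        /* may vectorize to vgatherdps when the
                                  flag is set for the -mtune target */
      return s;
    }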
@@ -417,10 +441,14 @@
 
 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
    the auto-vectorizer.  */
 DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
           | m_ZNVER1)
+
+/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
+   instructions in the auto-vectorizer.  */
+DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
 
 /*****************************************************************************/
 /* Historical relics: tuning flags that help specific old CPU designs       */
 /*****************************************************************************/
 
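The new avx256_optimal entry caps the auto-vectorizer at 256-bit vectors on AVX-512 CPUs, where 512-bit execution can reduce the core clock; the user-visible counterpart is -mprefer-vector-width=. An illustrative loop:

    void scale (float *x, float k, int n)
    {
      /* With avx256_optimal (m_CORE_AVX512 tuning) this vectorizes with
         256-bit ymm operations even though -mavx512f would allow zmm;
         -mprefer-vector-width=512 overrides the tuning default.  */
      for (int i = 0; i < n; i++)
        x[i] *= k;
    }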
@@ -534,5 +562,9 @@
 
 /* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
    arithmetic to 32bit via PROMOTE_MODE macro.  This code generation scheme
    is usually used for RISC targets.  */
 DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0U)
+
+/* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
+   before a transfer of control flow out of the function.  */
+DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)
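And the new emit_vzeroupper entry, illustrated; ~m_KNL excludes Knights Landing, where vzeroupper itself is expensive. A sketch assuming a 256-bit vectorized loop:

    void saxpy (float *restrict y, const float *restrict x, float a, int n)
    {
      for (int i = 0; i < n; i++)   /* vectorized with 256-bit ymm ops */
        y[i] += a * x[i];
      /* Before returning, GCC inserts "vzeroupper" so that later SSE code
         in the caller avoids the AVX->SSE transition penalty.  */
    }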