CbC/CbC_gcc: gcc/config/i386/x86-tune-costs.h comparison

comparison gcc/config/i386/x86-tune-costs.h @ 145:1830386684a0

gcc-9.2.0

author	anatofuz
date	Thu, 13 Feb 2020 11:34:05 +0900
parents	84e7813d76e9
children

comparison

Legend: equal (unmarked), deleted (-), inserted (+), replaced (a - line paired with a + line)

-:84e7813d76e9
+:1830386684a0
 /* Costs of operations of individual x86 CPUs.
-Copyright (C) 1988-2018 Free Software Foundation, Inc.
+Copyright (C) 1988-2020 Free Software Foundation, Inc.
 This file is part of GCC.
 GCC is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
 const
 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+2,				     /* cost for loading QImode using movzbl */
+{2, 2, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{2, 2, 2},				/* cost of storing integer registers */
+2,					/* cost of reg,reg fld/fst */
+{2, 2, 2},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{2, 2, 2},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+3,					/* cost of moving MMX register */
+{3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{3, 3},				/* cost of storing MMX registers
+					   in SImode and DImode */
+3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
+{3, 3, 3, 3, 3},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{3, 3, 3, 3, 3},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+3, 3,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_BYTES (2),			/* cost of an add instruction */
 COSTS_N_BYTES (3),			/* cost of a lea instruction */
 COSTS_N_BYTES (2),			/* variable shift costs */
 COSTS_N_BYTES (3),			/* constant shift costs */
 {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
 COSTS_N_BYTES (5)},			/*			    other */
 COSTS_N_BYTES (3),			/* cost of movsx */
 COSTS_N_BYTES (3),			/* cost of movzx */
 0,					/* "large" insn */
 2,					/* MOVE_RATIO */
+2,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2. */
-2,				     /* cost for loading QImode using movzbl */
 {2, 2, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {2, 2, 2},				/* cost of storing integer registers */
-2,					/* cost of reg,reg fld/fst */
+{3, 3, 3, 3, 3},			/* cost of loading SSE register
-{2, 2, 2},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{3, 3, 3, 3, 3},			/* cost of storing SSE register
-{2, 2, 2},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-3,					/* cost of moving MMX register */
-{3, 3},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{3, 3},				/* cost of storing MMX registers
-					   in SImode and DImode */
-3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
-{3, 3, 3, 3, 3},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {3, 3, 3, 3, 3},			/* cost of unaligned SSE load
 					   in 128bit, 256bit and 512bit */
-{3, 3, 3, 3, 3},			/* cost of storing SSE registers
+{3, 3, 3, 3, 3},			/* cost of unaligned SSE store
-					   in 32,64,128,256 and 512-bit */
-{3, 3, 3, 3, 3},				/* cost of unaligned SSE store
 					   in 128bit, 256bit and 512bit */
-3, 3,					/* SSE->integer and integer->SSE moves */
+3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
+3,					/* cost of moving SSE register to integer.  */
 5, 0,					/* Gather load static, per_elt.  */
 5, 0,					/* Gather store static, per_elt.  */
 0,					/* size of l1 cache  */
 0,					/* size of l2 cache  */
 0,					/* size of prefetch block */
 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs i386_cost = {	/* 386 specific costs */
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+4,				     /* cost for loading QImode using movzbl */
+{2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{2, 4, 2},				/* cost of storing integer registers */
+2,					/* cost of reg,reg fld/fst */
+{8, 8, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{8, 8, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{4, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{4, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+3, 3,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1),			/* cost of a lea instruction */
 COSTS_N_INSNS (3),			/* variable shift costs */
 COSTS_N_INSNS (2),			/* constant shift costs */
 {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (23)},			/*			    other */
 COSTS_N_INSNS (3),			/* cost of movsx */
 COSTS_N_INSNS (2),			/* cost of movzx */
 15,					/* "large" insn */
 3,					/* MOVE_RATIO */
+3,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-4,				     /* cost for loading QImode using movzbl */
 {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {2, 4, 2},				/* cost of storing integer registers */
-2,					/* cost of reg,reg fld/fst */
+{4, 8, 16, 32, 64},			/* cost of loading SSE register
-{8, 8, 8},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{4, 8, 16, 32, 64},			/* cost of storing SSE register
-{8, 8, 8},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-2,					/* cost of moving MMX register */
-{4, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{4, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-{4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-3, 3,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+3,					/* cost of moving SSE register to integer.  */
 4, 4,					/* Gather load static, per_elt.  */
 4, 4,					/* Gather store static, per_elt.  */
 0,					/* size of l1 cache  */
 0,					/* size of l2 cache  */
 0,					/* size of prefetch block */
 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs i486_cost = {	/* 486 specific costs */
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+4,				     /* cost for loading QImode using movzbl */
+{2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{2, 4, 2},				/* cost of storing integer registers */
+2,					/* cost of reg,reg fld/fst */
+{8, 8, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{8, 8, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{4, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{4, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+3, 3,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1),			/* cost of a lea instruction */
 COSTS_N_INSNS (3),			/* variable shift costs */
 COSTS_N_INSNS (2),			/* constant shift costs */
 {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (40)},			/*			    other */
 COSTS_N_INSNS (3),			/* cost of movsx */
 COSTS_N_INSNS (2),			/* cost of movzx */
 15,					/* "large" insn */
 3,					/* MOVE_RATIO */
+3,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-4,				     /* cost for loading QImode using movzbl */
 {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {2, 4, 2},				/* cost of storing integer registers */
-2,					/* cost of reg,reg fld/fst */
+{4, 8, 16, 32, 64},			/* cost of loading SSE register
-{8, 8, 8},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{4, 8, 16, 32, 64},			/* cost of storing SSE register
-{8, 8, 8},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-2,					/* cost of moving MMX register */
-{4, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{4, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-{4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-3, 3,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+3,					/* cost of moving SSE register to integer.  */
 4, 4,					/* Gather load static, per_elt.  */
 4, 4,					/* Gather store static, per_elt.  */
 4,					/* size of l1 cache.  486 has 8kB cache
 					   shared for code and data, so 4kB is
 					   not really precise.  */
 {libcall, {{-1, rep_prefix_4_byte, false}}},
 DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs pentium_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+6,				     /* cost for loading QImode using movzbl */
+{2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{2, 4, 2},				/* cost of storing integer registers */
+2,					/* cost of reg,reg fld/fst */
+{2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+8,					/* cost of moving MMX register */
+{8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+3, 3,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1),			/* cost of a lea instruction */
 COSTS_N_INSNS (4),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (25)},			/*			    other */
 COSTS_N_INSNS (3),			/* cost of movsx */
 COSTS_N_INSNS (2),			/* cost of movzx */
 8,					/* "large" insn */
 6,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-6,				     /* cost for loading QImode using movzbl */
 {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {2, 4, 2},				/* cost of storing integer registers */
-2,					/* cost of reg,reg fld/fst */
+{4, 8, 16, 32, 64},			/* cost of loading SSE register
-{2, 2, 6},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{4, 8, 16, 32, 64},			/* cost of storing SSE register
-{4, 4, 6},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-8,					/* cost of moving MMX register */
-{8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{8, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-{4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-3, 3,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+3,					/* cost of moving SSE register to integer.  */
 4, 4,					/* Gather load static, per_elt.  */
 4, 4,					/* Gather store static, per_elt.  */
 8,					/* size of l1 cache.  */
 8,					/* size of l2 cache  */
 0,					/* size of prefetch block */
 "16",					/* Func alignment.  */
 };
 static const
 struct processor_costs lakemont_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+6,				     /* cost for loading QImode using movzbl */
+{2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{2, 4, 2},				/* cost of storing integer registers */
+2,					/* cost of reg,reg fld/fst */
+{2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+8,					/* cost of moving MMX register */
+{8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+3, 3,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (25)},			/*			    other */
 COSTS_N_INSNS (3),			/* cost of movsx */
 COSTS_N_INSNS (2),			/* cost of movzx */
 8,					/* "large" insn */
 17,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-6,				     /* cost for loading QImode using movzbl */
 {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {2, 4, 2},				/* cost of storing integer registers */
-2,					/* cost of reg,reg fld/fst */
+{4, 8, 16, 32, 64},			/* cost of loading SSE register
-{2, 2, 6},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{4, 8, 16, 32, 64},			/* cost of storing SSE register
-{4, 4, 6},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-8,					/* cost of moving MMX register */
-{8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{8, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-{4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-3, 3,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+3,					/* cost of moving SSE register to integer.  */
 4, 4,					/* Gather load static, per_elt.  */
 4, 4,					/* Gather store static, per_elt.  */
 8,					/* size of l1 cache.  */
 8,					/* size of l2 cache  */
 0,					/* size of prefetch block */
 {8192, rep_prefix_4_byte, false},
 {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs pentiumpro_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+2,				     /* cost for loading QImode using movzbl */
+{4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{2, 2, 2},				/* cost of storing integer registers */
+2,					/* cost of reg,reg fld/fst */
+{2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+3, 3,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1),			/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (17)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 6,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-2,				     /* cost for loading QImode using movzbl */
 {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {2, 2, 2},				/* cost of storing integer registers */
-2,					/* cost of reg,reg fld/fst */
+{4, 8, 16, 32, 64},			/* cost of loading SSE register
-{2, 2, 6},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{4, 8, 16, 32, 64},			/* cost of storing SSE register
-{4, 4, 6},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-2,					/* cost of moving MMX register */
-{2, 2},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{2, 2},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-{4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-3, 3,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+3,					/* cost of moving SSE register to integer.  */
 4, 4,					/* Gather load static, per_elt.  */
 4, 4,					/* Gather store static, per_elt.  */
 8,					/* size of l1 cache.  */
 256,					/* size of l2 cache  */
 32,					/* size of prefetch block */
 static stringop_algs geode_memset[2] = {
 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs geode_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+2,				     /* cost for loading QImode using movzbl */
+{2, 2, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{2, 2, 2},				/* cost of storing integer registers */
+2,					/* cost of reg,reg fld/fst */
+{2, 2, 2},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{4, 6, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{2, 2, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{2, 2, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+6, 6,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1),			/* cost of a lea instruction */
 COSTS_N_INSNS (2),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (39)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 4,					/* MOVE_RATIO */
+4,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-2,				     /* cost for loading QImode using movzbl */
 {2, 2, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {2, 2, 2},				/* cost of storing integer registers */
-2,					/* cost of reg,reg fld/fst */
+{2, 2, 8, 16, 32},			/* cost of loading SSE register
-{2, 2, 2},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{2, 2, 8, 16, 32},			/* cost of storing SSE register
-{4, 6, 6},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-2,					/* cost of moving MMX register */
-{2, 2},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{2, 2},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{2, 2, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
-{2, 2, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
-6, 6,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+6,					/* cost of moving SSE register to integer.  */
 2, 2,					/* Gather load static, per_elt.  */
 2, 2,					/* Gather store static, per_elt.  */
 64,					/* size of l1 cache.  */
 128,					/* size of l2 cache.  */
 32,					/* size of prefetch block */
 static stringop_algs k6_memset[2] = {
 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs k6_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+3,				     /* cost for loading QImode using movzbl */
+{4, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{2, 3, 2},				/* cost of storing integer registers */
+4,					/* cost of reg,reg fld/fst */
+{6, 6, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{4, 4, 4},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{2, 2, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{2, 2, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+6, 6,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (2),			/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (18)},			/*			    other */
 COSTS_N_INSNS (2),			/* cost of movsx */
 COSTS_N_INSNS (2),			/* cost of movzx */
 8,					/* "large" insn */
 4,					/* MOVE_RATIO */
+4,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-3,				     /* cost for loading QImode using movzbl */
 {4, 5, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {2, 3, 2},				/* cost of storing integer registers */
-4,					/* cost of reg,reg fld/fst */
+{2, 2, 8, 16, 32},			/* cost of loading SSE register
-{6, 6, 6},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{2, 2, 8, 16, 32},			/* cost of storing SSE register
-{4, 4, 4},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-2,					/* cost of moving MMX register */
-{2, 2},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{2, 2},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{2, 2, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
-{2, 2, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
-6, 6,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+6,					/* cost of moving SSE register to integer.  */
 2, 2,					/* Gather load static, per_elt.  */
 2, 2,					/* Gather store static, per_elt.  */
 32,					/* size of l1 cache.  */
 32,					/* size of l2 cache.  Some models
 					   have integrated l2 cache, but
 static stringop_algs athlon_memset[2] = {
 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs athlon_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+4,				     /* cost for loading QImode using movzbl */
+{3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{3, 4, 3},				/* cost of storing integer registers */
+4,					/* cost of reg,reg fld/fst */
+{4, 4, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{4, 4},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{4, 4, 12, 12, 24},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{4, 4, 10, 10, 20},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+5, 5,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (2),			/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (74)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 9,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-4,				     /* cost for loading QImode using movzbl */
 {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {3, 4, 3},				/* cost of storing integer registers */
-4,					/* cost of reg,reg fld/fst */
+{4, 4, 12, 12, 24},			/* cost of loading SSE register
-{4, 4, 12},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{4, 4, 10, 10, 20},			/* cost of storing SSE register
-{6, 6, 8},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-2,					/* cost of moving MMX register */
-{4, 4},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{4, 4},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{4, 4, 12, 12, 24},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
-{4, 4, 10, 10, 20},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
-5, 5,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+5,					/* cost of moving SSE register to integer.  */
 4, 4,					/* Gather load static, per_elt.  */
 4, 4,					/* Gather store static, per_elt.  */
 64,					/* size of l1 cache.  */
 256,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 {libcall, {{48, unrolled_loop, false},
 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs k8_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+4,				     /* cost for loading QImode using movzbl */
+{3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{3, 4, 3},				/* cost of storing integer registers */
+4,					/* cost of reg,reg fld/fst */
+{4, 4, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{4, 3, 12, 12, 24},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{4, 4, 10, 10, 20},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+5, 5,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (2),			/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (74)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 9,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-4,				     /* cost for loading QImode using movzbl */
 {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {3, 4, 3},				/* cost of storing integer registers */
-4,					/* cost of reg,reg fld/fst */
+{4, 3, 12, 12, 24},			/* cost of loading SSE register
-{4, 4, 12},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{4, 4, 10, 10, 20},			/* cost of storing SSE register
-{6, 6, 8},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-2,					/* cost of moving MMX register */
-{3, 3},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{4, 4},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{4, 3, 12, 12, 24},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
-{4, 4, 10, 10, 20},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
-5, 5,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+5,					/* cost of moving SSE register to integer.  */
 4, 4,					/* Gather load static, per_elt.  */
 4, 4,					/* Gather store static, per_elt.  */
 64,					/* size of l1 cache.  */
 512,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 {-1, libcall, false}}}};
 struct processor_costs amdfam10_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+4,				     /* cost for loading QImode using movzbl */
+{3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{3, 4, 3},				/* cost of storing integer registers */
+4,					/* cost of reg,reg fld/fst */
+{4, 4, 12},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode */
+{6, 6, 8},				/* cost of storing fp registers
+		   			   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{4, 4, 3, 6, 12},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{4, 4, 5, 10, 20},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+3, 3,					/* SSE->integer and integer->SSE moves */
+					/* On K8:
+					    MOVD reg64, xmmreg Double FSTORE 4
+					    MOVD reg32, xmmreg Double FSTORE 4
+					   On AMDFAM10:
+					    MOVD reg64, xmmreg Double FADD 3
+							       1/1  1/1
+					    MOVD reg32, xmmreg Double FADD 3
+							       1/1  1/1 */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (2),			/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (83)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 9,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-4,				     /* cost for loading QImode using movzbl */
 {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {3, 4, 3},				/* cost of storing integer registers */
-4,					/* cost of reg,reg fld/fst */
+{4, 4, 3, 6, 12},			/* cost of loading SSE register
-{4, 4, 12},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-		   			   in SFmode, DFmode and XFmode */
+{4, 4, 5, 10, 20},			/* cost of storing SSE register
-{6, 6, 8},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-		   			   in SFmode, DFmode and XFmode */
-2,					/* cost of moving MMX register */
-{3, 3},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{4, 4},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{4, 4, 3, 6, 12},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
-{4, 4, 5, 10, 20},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
-3, 3,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-					/* On K8:
+3,					/* cost of moving SSE register to integer.  */
-					    MOVD reg64, xmmreg Double FSTORE 4
-					    MOVD reg32, xmmreg Double FSTORE 4
-					   On AMDFAM10:
-					    MOVD reg64, xmmreg Double FADD 3
-							       1/1  1/1
-					    MOVD reg32, xmmreg Double FADD 3
-							       1/1  1/1 */
 4, 4,					/* Gather load static, per_elt.  */
 4, 4,					/* Gather store static, per_elt.  */
 64,					/* size of l1 cache.  */
 512,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 {-1, libcall, false}}}};
 const struct processor_costs bdver_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+8,				     /* cost for loading QImode using movzbl */
+{8, 8, 8},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{8, 8, 8},				/* cost of storing integer registers */
+4,					/* cost of reg,reg fld/fst */
+{12, 12, 28},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode */
+{10, 10, 18},				/* cost of storing fp registers
+		   			   in SFmode, DFmode and XFmode */
+4,					/* cost of moving MMX register */
+{12, 12},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{10, 10},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{12, 12, 10, 40, 60},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{10, 10, 10, 40, 60},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+16, 20,				/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1),			/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (83)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 9,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-8,				     /* cost for loading QImode using movzbl */
 {8, 8, 8},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {8, 8, 8},				/* cost of storing integer registers */
-4,					/* cost of reg,reg fld/fst */
+{12, 12, 10, 40, 60},			/* cost of loading SSE register
-{12, 12, 28},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-		   			   in SFmode, DFmode and XFmode */
+{10, 10, 10, 40, 60},			/* cost of storing SSE register
-{10, 10, 18},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-		   			   in SFmode, DFmode and XFmode */
-4,					/* cost of moving MMX register */
-{12, 12},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{10, 10},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{12, 12, 10, 40, 60},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
-{10, 10, 10, 40, 60},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
-16, 20,				/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+16,					/* cost of moving SSE register to integer.  */
 12, 12,				/* Gather load static, per_elt.  */
 10, 10,				/* Gather store static, per_elt.  */
 16,					/* size of l1 cache.  */
 2048,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 	     {-1, libcall, false}}}};
 struct processor_costs znver1_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+/* reg-reg moves are done by renaming and thus they are even cheaper than
+1 cycle. Because reg-reg move cost is 2 and the following tables correspond
+to doubles of latencies, we do not model this correctly.  It does not
+seem to make practical difference to bump prices up even more.  */
+6,					/* cost for loading QImode using
+					   movzbl.  */
+{6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{8, 8, 8},				/* cost of storing integer
+					   registers.  */
+2,					/* cost of reg,reg fld/fst.  */
+{6, 6, 16},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode.  */
+{8, 8, 16},				/* cost of storing fp registers
+		   			   in SFmode, DFmode and XFmode.  */
+2,					/* cost of moving MMX register.  */
+{6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode.  */
+{8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode.  */
+2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
+{6, 6, 6, 12, 24},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit.  */
+{8, 8, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit.  */
+6, 6,					/* SSE->integer and integer->SSE moves.  */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction.  */
 COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
 COSTS_N_INSNS (1),			/* variable shift costs.  */
 COSTS_N_INSNS (1),			/* constant shift costs.  */
 {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
 COSTS_N_INSNS (45)},			/*			    other.  */
 COSTS_N_INSNS (1),			/* cost of movsx.  */
 COSTS_N_INSNS (1),			/* cost of movzx.  */
 8,					/* "large" insn.  */
 9,					/* MOVE_RATIO.  */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-/* reg-reg moves are done by renaming and thus they are even cheaper than
-1 cycle. Because reg-reg move cost is 2 and the following tables correspond
-to doubles of latencies, we do not model this correctly.  It does not
-seem to make practical difference to bump prices up even more.  */
-6,					/* cost for loading QImode using
-					   movzbl.  */
 {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {8, 8, 8},				/* cost of storing integer
 					   registers.  */
-2,					/* cost of reg,reg fld/fst.  */
+{6, 6, 6, 12, 24},			/* cost of loading SSE register
-{6, 6, 16},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-		   			   in SFmode, DFmode and XFmode.  */
+{8, 8, 8, 16, 32},			/* cost of storing SSE register
-{8, 8, 16},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-		   			   in SFmode, DFmode and XFmode.  */
+{6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
-2,					/* cost of moving MMX register.  */
+{8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
-{6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode.  */
-{8, 8},				/* cost of storing MMX registers
-					   in SImode and DImode.  */
 2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
-{6, 6, 6, 12, 24},			/* cost of loading SSE registers
+6,					/* cost of moving SSE register to integer.  */
-					   in 32,64,128,256 and 512-bit.  */
-{6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
-{8, 8, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit.  */
-{8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
-6, 6,					/* SSE->integer and integer->SSE moves.  */
 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
 throughput 12.  Approx 9 uops do not depend on vector size and every load
 is 7 uops.  */
 18, 8,				/* Gather load static, per_elt.  */
 18, 10,				/* Gather store static, per_elt.  */
 "16",					/* Jump alignment.  */
 "0:0:8",				/* Label alignment.  */
 "16",					/* Func alignment.  */
 };
+/*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
+very small blocks it is better to use loop.  For large blocks, libcall
+can do non-temporal accesses and beat inline considerably.  */
+static stringop_algs znver2_memcpy[2] = {
+{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	     {-1, rep_prefix_4_byte, false}}},
+{libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false},
+	     {-1, libcall, false}}}};
+static stringop_algs znver2_memset[2] = {
+{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+{libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, false},
+	     {-1, libcall, false}}}};
+struct processor_costs znver2_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+/* reg-reg moves are done by renaming and thus they are even cheaper than
+1 cycle.  Because reg-reg move cost is 2 and following tables correspond
+to doubles of latencies, we do not model this correctly.  It does not
+seem to make practical difference to bump prices up even more.  */
+6,					/* cost for loading QImode using
+					   movzbl.  */
+{6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{8, 8, 8},				/* cost of storing integer
+					   registers.  */
+2,					/* cost of reg,reg fld/fst.  */
+{6, 6, 16},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode.  */
+{8, 8, 16},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode.  */
+2,					/* cost of moving MMX register.  */
+{6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode.  */
+{8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode.  */
+2, 2, 3,				/* cost of moving XMM,YMM,ZMM
+					   register.  */
+{6, 6, 6, 6, 12},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit.  */
+{8, 8, 8, 8, 16},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit.  */
+6, 6,					/* SSE->integer and integer->SSE
+					   moves.  */
+/* End of register allocator costs.  */
+},
+COSTS_N_INSNS (1),			/* cost of an add instruction.  */
+COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
+COSTS_N_INSNS (1),			/* variable shift costs.  */
+COSTS_N_INSNS (1),			/* constant shift costs.  */
+{COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
+COSTS_N_INSNS (3),			/* 				 HI.  */
+COSTS_N_INSNS (3),			/*				 SI.  */
+COSTS_N_INSNS (3),			/*				 DI.  */
+COSTS_N_INSNS (3)},			/*			other.  */
+0,					/* cost of multiply per each bit
+					   set.  */
+/* Depending on parameters, idiv can get faster on ryzen.  This is upper
+bound.  */
+{COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
+COSTS_N_INSNS (22),			/* 			    HI.  */
+COSTS_N_INSNS (30),			/*			    SI.  */
+COSTS_N_INSNS (45),			/*			    DI.  */
+COSTS_N_INSNS (45)},			/*			    other.  */
+COSTS_N_INSNS (1),			/* cost of movsx.  */
+COSTS_N_INSNS (1),			/* cost of movzx.  */
+8,					/* "large" insn.  */
+9,					/* MOVE_RATIO.  */
+6,					/* CLEAR_RATIO */
+{6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{8, 8, 8},				/* cost of storing integer
+					   registers.  */
+{6, 6, 6, 6, 12},			/* cost of loading SSE registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+{8, 8, 8, 8, 16},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+{6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
+{8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+2, 2, 3,				/* cost of moving XMM,YMM,ZMM
+					   register.  */
+6,					/* cost of moving SSE register to integer.  */
+/* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
+throughput 12.  Approx 9 uops do not depend on vector size and every load
+is 7 uops.  */
+18, 8,				/* Gather load static, per_elt.  */
+18, 10,				/* Gather store static, per_elt.  */
+32,					/* size of l1 cache.  */
+512,					/* size of l2 cache.  */
+64,					/* size of prefetch block.  */
+/* New AMD processors never drop prefetches; if they cannot be performed
+immediately, they are queued.  We set number of simultaneous prefetches
+to a large constant to reflect this (it probably is not a good idea not
+to limit number of prefetches at all, as their execution also takes some
+time).  */
+100,					/* number of parallel prefetches.  */
+3,					/* Branch cost.  */
+COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
+COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
+/* Latency of fdiv is 8-15.  */
+COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
+COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
+COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
+/* Latency of fsqrt is 4-10.  */
+COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
+COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
+COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
+COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
+COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
+COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
+COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
+/* 9-13.  */
+COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
+COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
+COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
+/* Zen can execute 4 integer operations per cycle.  FP operations
+take 3 cycles and it can execute 2 integer additions and 2
+multiplications thus reassociation may make sense up to a width of 6.
+SPEC2k6 benchmarks suggest
+that 4 works better than 6 probably due to register pressure.
+Integer vector operations are taken by FP unit and execute 3 vector
+plus/minus operations per cycle but only one multiply.  This is adjusted
+in ix86_reassociation_width.  */
+4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
+znver2_memcpy,
+znver2_memset,
+COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
+COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
+"16",					/* Loop alignment.  */
+"16",					/* Jump alignment.  */
+"0:0:8",				/* Label alignment.  */
+"16",					/* Func alignment.  */
+};
 /* skylake_cost should produce code tuned for Skylake family of CPUs.  */
 static stringop_algs skylake_memcpy[2] =   {
 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
 {libcall, {{16, loop, false}, {512, unrolled_loop, false},
 {-1, libcall, false}}}};
 {libcall, {{24, loop, true}, {512, unrolled_loop, false},
 {-1, libcall, false}}}};
 static const
 struct processor_costs skylake_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+6,				     /* cost for loading QImode using movzbl */
+{4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{6, 6, 6},				/* cost of storing integer registers */
+2,					/* cost of reg,reg fld/fst */
+{6, 6, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{6, 6, 10},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+{6, 6, 6, 10, 20},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{8, 8, 8, 12, 24},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+6, 6,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1)+1,		/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (76)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (0),			/* cost of movzx */
 8,					/* "large" insn */
 17,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-6,				     /* cost for loading QImode using movzbl */
 {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
-{6, 6, 3},				/* cost of storing integer registers */
+{6, 6, 6},				/* cost of storing integer registers */
-2,					/* cost of reg,reg fld/fst */
+{6, 6, 6, 10, 20},			/* cost of loading SSE register
-{6, 6, 8},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{8, 8, 8, 12, 24},			/* cost of storing SSE register
-{6, 6, 10},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
-2,					/* cost of moving MMX register */
+{8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
-{6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
 2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
-{6, 6, 6, 10, 20},			/* cost of loading SSE registers
+2,					/* cost of moving SSE register to integer.  */
-					   in 32,64,128,256 and 512-bit */
-{6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
-{8, 8, 8, 12, 24},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-{8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
-2, 2,					/* SSE->integer and integer->SSE moves */
 20, 8,				/* Gather load static, per_elt.  */
 22, 10,				/* Gather store static, per_elt.  */
 64,					/* size of l1 cache.  */
 512,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 {-1, libcall, false}}}};
 const struct processor_costs btver1_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+8,				     /* cost for loading QImode using movzbl */
+{6, 8, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{6, 8, 6},				/* cost of storing integer registers */
+4,					/* cost of reg,reg fld/fst */
+{12, 12, 28},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{12, 12, 38},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+4,					/* cost of moving MMX register */
+{10, 10},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{12, 12},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{10, 10, 12, 48, 96},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{10, 10, 12, 48, 96},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+14, 14,				/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (2),			/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (83)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 9,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-8,				     /* cost for loading QImode using movzbl */
 {6, 8, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {6, 8, 6},				/* cost of storing integer registers */
-4,					/* cost of reg,reg fld/fst */
+{10, 10, 12, 48, 96},			/* cost of loading SSE register
-{12, 12, 28},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{10, 10, 12, 48, 96},			/* cost of storing SSE register
-{12, 12, 38},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-4,					/* cost of moving MMX register */
-{10, 10},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{12, 12},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{10, 10, 12, 48, 96},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
-{10, 10, 12, 48, 96},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
-14, 14,				/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+14,					/* cost of moving SSE register to integer.  */
 10, 10,				/* Gather load static, per_elt.  */
 10, 10,				/* Gather store static, per_elt.  */
 32,					/* size of l1 cache.  */
 512,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 {-1, libcall, false}}}};
 const struct processor_costs btver2_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+8,				     /* cost for loading QImode using movzbl */
+{8, 8, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{8, 8, 6},				/* cost of storing integer registers */
+4,					/* cost of reg,reg fld/fst */
+{12, 12, 28},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{12, 12, 38},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+4,					/* cost of moving MMX register */
+{10, 10},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{12, 12},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{10, 10, 12, 48, 96},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{10, 10, 12, 48, 96},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+14, 14,				/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (2),			/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (83)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 9,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-8,				     /* cost for loading QImode using movzbl */
 {8, 8, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {8, 8, 6},				/* cost of storing integer registers */
-4,					/* cost of reg,reg fld/fst */
+{10, 10, 12, 48, 96},			/* cost of loading SSE register
-{12, 12, 28},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{10, 10, 12, 48, 96},			/* cost of storing SSE register
-{12, 12, 38},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-4,					/* cost of moving MMX register */
-{10, 10},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{12, 12},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{10, 10, 12, 48, 96},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
-{10, 10, 12, 48, 96},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
-14, 14,				/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+14,					/* cost of moving SSE register to integer.  */
 10, 10,				/* Gather load static, per_elt.  */
 10, 10,				/* Gather store static, per_elt.  */
 32,					/* size of l1 cache.  */
 2048,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs pentium4_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+5,				     /* cost for loading QImode using movzbl */
+{4, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{2, 3, 2},				/* cost of storing integer registers */
+12,					/* cost of reg,reg fld/fst */
+{14, 14, 14},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{14, 14, 14},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+12,					/* cost of moving MMX register */
+{16, 16},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{16, 16},				/* cost of storing MMX registers
+					   in SImode and DImode */
+12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
+{16, 16, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{16, 16, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+20, 12,				/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (3),			/* cost of a lea instruction */
 COSTS_N_INSNS (4),			/* variable shift costs */
 COSTS_N_INSNS (4),			/* constant shift costs */
 {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (56)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 16,					/* "large" insn */
 6,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-5,				     /* cost for loading QImode using movzbl */
 {4, 5, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {2, 3, 2},				/* cost of storing integer registers */
-12,					/* cost of reg,reg fld/fst */
+{16, 16, 16, 32, 64},			/* cost of loading SSE register
-{14, 14, 14},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{16, 16, 16, 32, 64},			/* cost of storing SSE register
-{14, 14, 14},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
-12,					/* cost of moving MMX register */
+{32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
-{16, 16},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{16, 16},				/* cost of storing MMX registers
-					   in SImode and DImode */
 12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
-{16, 16, 16, 32, 64},			/* cost of loading SSE registers
+20,					/* cost of moving SSE register to integer.  */
-					   in 32,64,128,256 and 512-bit */
-{32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
-{16, 16, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-{32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
-20, 12,				/* SSE->integer and integer->SSE moves */
 16, 16,				/* Gather load static, per_elt.  */
 16, 16,				/* Gather store static, per_elt.  */
 8,					/* size of l1 cache.  */
 256,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs nocona_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+4,				     /* cost for loading QImode using movzbl */
+{4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{4, 4, 4},				/* cost of storing integer registers */
+12,					/* cost of reg,reg fld/fst */
+{14, 14, 14},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{14, 14, 14},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+14,					/* cost of moving MMX register */
+{12, 12},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{12, 12},				/* cost of storing MMX registers
+					   in SImode and DImode */
+6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
+{12, 12, 12, 24, 48},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{12, 12, 12, 24, 48},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+20, 12,				/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1),			/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (66)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 16,					/* "large" insn */
 17,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-4,				     /* cost for loading QImode using movzbl */
 {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {4, 4, 4},				/* cost of storing integer registers */
-12,					/* cost of reg,reg fld/fst */
+{12, 12, 12, 24, 48},			/* cost of loading SSE register
-{14, 14, 14},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{12, 12, 12, 24, 48},			/* cost of storing SSE register
-{14, 14, 14},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
-14,					/* cost of moving MMX register */
+{24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
-{12, 12},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{12, 12},				/* cost of storing MMX registers
-					   in SImode and DImode */
 6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
-{12, 12, 12, 24, 48},			/* cost of loading SSE registers
+20,					/* cost of moving SSE register to integer.  */
-					   in 32,64,128,256 and 512-bit */
-{24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
-{12, 12, 12, 24, 48},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-{24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
-20, 12,				/* SSE->integer and integer->SSE moves */
 12, 12,				/* Gather load static, per_elt.  */
 12, 12,				/* Gather store static, per_elt.  */
 8,					/* size of l1 cache.  */
 1024,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs atom_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+6,					/* cost for loading QImode using movzbl */
+{6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{6, 6, 6},				/* cost of storing integer registers */
+4,					/* cost of reg,reg fld/fst */
+{6, 6, 18},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{14, 14, 24},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{10, 10},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{8, 8, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{8, 8, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+8, 6,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (74)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 17,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-6,					/* cost for loading QImode using movzbl */
 {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {6, 6, 6},				/* cost of storing integer registers */
-4,					/* cost of reg,reg fld/fst */
+{8, 8, 8, 16, 32},			/* cost of loading SSE register
-{6, 6, 18},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{8, 8, 8, 16, 32},			/* cost of storing SSE register
-{14, 14, 24},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-2,					/* cost of moving MMX register */
-{8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{10, 10},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{8, 8, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
-{8, 8, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
-8, 6,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+8,					/* cost of moving SSE register to integer.  */
 8, 8,					/* Gather load static, per_elt.  */
 8, 8,					/* Gather store static, per_elt.  */
 32,					/* size of l1 cache.  */
 256,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs slm_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+8,					/* cost for loading QImode using movzbl */
+{8, 8, 8},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{6, 6, 6},				/* cost of storing integer registers */
+2,					/* cost of reg,reg fld/fst */
+{8, 8, 18},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{6, 6, 18},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+{8, 8, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{8, 8, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+8, 6,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (74)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 17,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-8,					/* cost for loading QImode using movzbl */
 {8, 8, 8},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {6, 6, 6},				/* cost of storing integer registers */
-2,					/* cost of reg,reg fld/fst */
+{8, 8, 8, 16, 32},			/* cost of loading SSE register
-{8, 8, 18},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{8, 8, 8, 16, 32},			/* cost of storing SSE register
-{6, 6, 18},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
-2,					/* cost of moving MMX register */
-{8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-{8, 8, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
 {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
-{8, 8, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
 {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
-8, 6,					/* SSE->integer and integer->SSE moves */
+2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+8,					/* cost of moving SSE register to integer.  */
 8, 8,					/* Gather load static, per_elt.  */
 8, 8,					/* Gather store static, per_elt.  */
 32,					/* size of l1 cache.  */
 256,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs intel_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+6,				     /* cost for loading QImode using movzbl */
+{4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{6, 6, 6},				/* cost of storing integer registers */
+2,					/* cost of reg,reg fld/fst */
+{6, 6, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{6, 6, 10},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
+{6, 6, 6, 6, 6},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{6, 6, 6, 6, 6},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+4, 4,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
 COSTS_N_INSNS (1),			/* variable shift costs */
 COSTS_N_INSNS (1),			/* constant shift costs */
 {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
 COSTS_N_INSNS (74)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 17,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-6,				     /* cost for loading QImode using movzbl */
 {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {6, 6, 6},				/* cost of storing integer registers */
-2,					/* cost of reg,reg fld/fst */
+{6, 6, 6, 6, 6},			/* cost of loading SSE register
-{6, 6, 8},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{6, 6, 6, 6, 6},			/* cost of storing SSE register
-{6, 6, 10},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
-2,					/* cost of moving MMX register */
+{10, 10, 10, 10, 10},			/* cost of unaligned stores.  */
-{6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
 2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
-{6, 6, 6, 6, 6},			/* cost of loading SSE registers
+4,					/* cost of moving SSE register to integer.  */
-					   in 32,64,128,256 and 512-bit */
-{10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
-{6, 6, 6, 6, 6},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-{10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
-4, 4,					/* SSE->integer and integer->SSE moves */
 6, 6,					/* Gather load static, per_elt.  */
 6, 6,					/* Gather store static, per_elt.  */
 32,					/* size of l1 cache.  */
 256,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
 COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
 COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
 COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
-COSTS_N_INSNS (8),			/* cost of cheap SSE instruction.  */
+COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
 COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
 COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
 COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
 COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
 COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
 {-1, libcall, false}}},
 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
 {-1, libcall, false}}}};
 static const
 struct processor_costs generic_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+6,				     /* cost for loading QImode using movzbl */
+{6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{6, 6, 6},				/* cost of storing integer registers */
+4,					/* cost of reg,reg fld/fst */
+{6, 6, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{6, 6, 12},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
+{6, 6, 6, 10, 15},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{6, 6, 6, 10, 15},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+6, 6,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 /* Setting cost to 2 makes our current implementation of synth_mult result in
 use of unnecessary temporary registers causing regression on several
 SPECfp benchmarks.  */
 COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
 COSTS_N_INSNS (74)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 17,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-6,				     /* cost for loading QImode using movzbl */
 {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {6, 6, 6},				/* cost of storing integer registers */
-4,					/* cost of reg,reg fld/fst */
+{6, 6, 6, 10, 15},			/* cost of loading SSE register
-{6, 6, 12},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{6, 6, 6, 10, 15},			/* cost of storing SSE register
-{6, 6, 12},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
-2,					/* cost of moving MMX register */
+{6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
-{6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
 2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
-{6, 6, 6, 10, 15},			/* cost of loading SSE registers
+6,					/* cost of moving SSE register to integer.  */
-					   in 32,64,128,256 and 512-bit */
-{6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
-{6, 6, 6, 10, 15},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-{6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
-6, 6,					/* SSE->integer and integer->SSE moves */
 18, 6,				/* Gather load static, per_elt.  */
 18, 6,				/* Gather store static, per_elt.  */
 32,					/* size of l1 cache.  */
 512,					/* size of l2 cache.  */
 64,					/* size of prefetch block */
 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
 {-1, libcall, false}}}};
 static const
 struct processor_costs core_cost = {
+{
+/* Start of register allocator costs.  integer->integer move cost is 2. */
+6,				     /* cost for loading QImode using movzbl */
+{4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+{6, 6, 6},				/* cost of storing integer registers */
+2,					/* cost of reg,reg fld/fst */
+{6, 6, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+{6, 6, 10},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+2,					/* cost of moving MMX register */
+{6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+{6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+{6, 6, 6, 6, 12},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+{6, 6, 6, 6, 12},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+6, 6,					/* SSE->integer and integer->SSE moves */
+/* End of register allocator costs.  */
+},
 COSTS_N_INSNS (1),			/* cost of an add instruction */
 /* On all chips taken into consideration lea is 2 cycles and more.  With
 this cost however our current implementation of synth_mult results in
 use of unnecessary temporary registers causing regression on several
 SPECfp benchmarks.  */
 COSTS_N_INSNS (81)},			/*			    other */
 COSTS_N_INSNS (1),			/* cost of movsx */
 COSTS_N_INSNS (1),			/* cost of movzx */
 8,					/* "large" insn */
 17,					/* MOVE_RATIO */
+6,					/* CLEAR_RATIO */
-/* All move costs are relative to integer->integer move times 2 and thus
-they are latency*2. */
-6,				     /* cost for loading QImode using movzbl */
 {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 {6, 6, 6},				/* cost of storing integer registers */
-2,					/* cost of reg,reg fld/fst */
+{6, 6, 6, 6, 12},			/* cost of loading SSE register
-{6, 6, 8},				/* cost of loading fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{6, 6, 6, 6, 12},			/* cost of storing SSE register
-{6, 6, 10},				/* cost of storing fp registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-					   in SFmode, DFmode and XFmode */
+{6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
-2,					/* cost of moving MMX register */
+{6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
-{6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-{6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
 2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
-{6, 6, 6, 6, 12},			/* cost of loading SSE registers
+2,					/* cost of moving SSE register to integer.  */
-					   in 32,64,128,256 and 512-bit */
-{6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
-{6, 6, 6, 6, 12},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-{6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
-2, 2,					/* SSE->integer and integer->SSE moves */
 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
 rec. throughput 6.
 So 5 uops statically and one uop per load.  */
 10, 6,				/* Gather load static, per_elt.  */
 10, 6,				/* Gather store static, per_elt.  */

Mercurial > hg > CbC > CbC_gcc

comparison gcc/config/i386/x86-tune-costs.h @ 145:1830386684a0