111
|
1 ;; Scheduling description for Niagara-7
|
131
|
2 ;; Copyright (C) 2016-2018 Free Software Foundation, Inc.
|
111
|
3 ;;
|
|
4 ;; This file is part of GCC.
|
|
5 ;;
|
|
6 ;; GCC is free software; you can redistribute it and/or modify
|
|
7 ;; it under the terms of the GNU General Public License as published by
|
|
8 ;; the Free Software Foundation; either version 3, or (at your option)
|
|
9 ;; any later version.
|
|
10 ;;
|
|
11 ;; GCC is distributed in the hope that it will be useful,
|
|
12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 ;; GNU General Public License for more details.
|
|
15 ;;
|
|
16 ;; You should have received a copy of the GNU General Public License
|
|
17 ;; along with GCC; see the file COPYING3. If not see
|
|
18 ;; <http://www.gnu.org/licenses/>.
|
|
19
|
|
;; Automaton that the Niagara-7 CPU units in this description are attached to.
(define_automaton
  "niagara7_0")
|
|
21
|
|
22 ;; The S4 core has a dual-issue queue. This queue is divided into two
|
|
23 ;; slots. One instruction can be issued each cycle to each slot, and
|
|
24 ;; up to 2 instructions are committed each cycle. Each slot serves
|
|
25 ;; several execution units, as depicted below:
|
|
26 ;;
|
|
27 ;;
|
|
;;                    n7_slot0 - Integer unit.
;;                             - Load/Store unit.
;;    === QUEUE ==>
;;
;;                    n7_slot1 - Integer unit.
;;                             - Branch unit.
;;                             - Floating-point and graphics unit.
;;                             - 3-cycle crypto unit.
|
|
36
|
|
;; The two issue slots; both belong to the "niagara7_0" automaton.
(define_cpu_unit
  "n7_slot0,n7_slot1"
  "niagara7_0")
|
|
38
|
|
;; Some instructions stall the pipeline and prevent any other
;; instruction from being issued in the same cycle.  We assume the same
;; for multi-instruction insns.
|
|
42
|
|
;; Reserves both slots in the same cycle, so nothing else can issue with it.
(define_reservation
  "n7_single_issue"
  "n7_slot0 + n7_slot1")
|
|
44
|
|
(define_insn_reservation "n7_single"
  1  ;; latency in cycles
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "multi,savew,flushw,trap"))
  ;; Uses the reservation that takes n7_slot0 and n7_slot1 together.
  "n7_single_issue")
|
|
49
|
|
50 ;; Most of the instructions executing in the integer unit have a
|
|
51 ;; latency of 1.
|
|
52
|
|
(define_insn_reservation "n7_integer"
  1  ;; latency in cycles
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
  ;; Either slot may be used for one cycle.
  "(n7_slot0 | n7_slot1)")
|
|
57
|
|
58 ;; Flushing the instruction memory takes 27 cycles.
|
|
59
|
|
(define_insn_reservation "n7_iflush"
  27  ;; latency in cycles
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "iflush"))
  ;; One cycle in either slot, then 26 cycles with no unit reserved
  ;; (27 cycles in total, matching the latency).
  "(n7_slot0 | n7_slot1), nothing*26")
|
|
64
|
|
65 ;; The integer multiplication instructions have a latency of 12 cycles
|
|
66 ;; and execute in the integer unit.
|
|
67 ;;
|
|
68 ;; Likewise for array*, edge* and pdistn instructions.
|
|
69
|
|
(define_insn_reservation "n7_imul"
  12  ;; latency in cycles
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "imul,array,edge,edgen,pdistn"))
  ;; One cycle in either slot, then 11 cycles with no unit reserved.
  "(n7_slot0 | n7_slot1), nothing*11")
|
|
74
|
|
75 ;; The integer division instructions have a latency of 35 cycles and
|
|
76 ;; execute in the integer unit.
|
|
77
|
|
(define_insn_reservation "n7_idiv"
  35  ;; latency in cycles
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "idiv"))
  ;; One cycle in either slot, then 34 cycles with no unit reserved.
  "(n7_slot0 | n7_slot1), nothing*34")
|
|
82
|
|
83 ;; Both integer and floating-point load instructions have a latency of
|
|
84 ;; 5 cycles, and execute in the slot0.
|
|
85 ;;
|
|
86 ;; The prefetch instruction also executes in the load/store unit, but
|
|
87 ;; its latency is only 1 cycle.
|
|
88
|
|
(define_insn_reservation "n7_load"
  5  ;; latency in cycles
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "fpload,sload")
            ;; Type "load" is matched only with subtype "regular";
            ;; subtype "prefetch" is covered by n7_prefetch.
            (and (eq_attr "type" "load")
                 (eq_attr "subtype" "regular"))))
  ;; One cycle in slot0, then 4 cycles with no unit reserved.
  "n7_slot0, nothing*4")
|
|
95
|
|
(define_insn_reservation "n7_prefetch"
  1  ;; latency in cycles
  ;; Same three conditions as before, written as nested two-operand ANDs.
  (and (eq_attr "cpu" "niagara7")
       (and (eq_attr "type" "load")
            (eq_attr "subtype" "prefetch")))
  "n7_slot0")
|
|
101
|
|
102 ;; Both integer and floating-point store instructions have a latency
|
|
103 ;; of 1 cycle, and execute in the load/store unit in slot0.
|
|
104
|
|
(define_insn_reservation "n7_store"
  1  ;; latency in cycles
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "store,fpstore"))
  ;; Stores may issue only to slot0.
  "n7_slot0")
|
|
109
|
|
110 ;; Control-transfer instructions execute in the Branch Unit in the
|
|
111 ;; slot1.
|
|
112
|
|
(define_insn_reservation "n7_cti"
  1  ;; latency in cycles
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
  ;; Control transfers may issue only to slot1.
  "n7_slot1")
|
|
117
|
|
118 ;; Many instructions executing in the Floating-point and Graphics unit
|
|
119 ;; in the slot1 feature a latency of 11 cycles.
|
|
120
|
|
(define_insn_reservation "n7_fp"
  11  ;; latency in cycles
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")
            ;; Of the "fga" insns, only the "fpu" and "maxmin" subtypes
            ;; are matched here.
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "fpu,maxmin"))))
  ;; One cycle in slot1, then 10 cycles with no unit reserved.
  "n7_slot1, nothing*10")
|
|
127
|
|
128 ;; Floating-point division and floating-point square-root instructions
|
|
129 ;; have high latencies. They execute in the floating-point and
|
|
130 ;; graphics unit in the slot1.
|
|
131
|
|
132
|
|
(define_insn_reservation "n7_fpdivs"
  24  ;; latency in cycles
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "fpdivs,fpsqrts"))
  ;; One cycle in slot1, then 23 cycles with no unit reserved.
  "n7_slot1, nothing*23")
|
|
137
|
|
(define_insn_reservation "n7_fpdivd"
  37  ;; latency in cycles
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "fpdivd,fpsqrtd"))
  ;; One cycle in slot1, then 36 cycles with no unit reserved.
  "n7_slot1, nothing*36")
|
|
142
|
|
143 ;; SIMD VIS instructions executing in the Floating-point and graphics
|
|
144 ;; unit (FPG) in slot1 usually have a latency of either 11 or 12
|
|
145 ;; cycles.
|
|
146 ;;
|
|
147 ;; However, the latency for many instructions is only 3 cycles if the
|
|
148 ;; consumer can also be executed in 3 cycles. We model this with a
|
|
149 ;; bypass. In these cases the instructions are executed in the
|
|
150 ;; 3-cycle crypto unit which also serves slot1.
|
|
151
|
|
(define_insn_reservation "n7_vis_11cycles"
  11  ;; latency in cycles
  ;; The three alternatives are written as nested two-operand IORs.
  (and (eq_attr "cpu" "niagara7")
       (ior (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "addsub64,other"))
            (ior (and (eq_attr "type" "vismv")
                      (eq_attr "subtype" "double,single"))
                 (and (eq_attr "type" "visl")
                      (eq_attr "subtype" "double,single")))))
  ;; One cycle in slot1, then 10 cycles with no unit reserved.
  "n7_slot1, nothing*10")
|
|
161
|
|
(define_insn_reservation "n7_vis_12cycles"
  12  ;; latency in cycles
  ;; The three alternatives are written as nested two-operand IORs.
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "bmask,viscmp")
            (ior (and (eq_attr "type" "fga")
                      (eq_attr "subtype" "cmask"))
                 (and (eq_attr "type" "vismv")
                      (eq_attr "subtype" "movstouw")))))
  ;; One cycle in slot1, then 11 cycles with no unit reserved.
  "n7_slot1, nothing*11")
|
|
170
|
|
;; 3-cycle latency whenever both the producer and the consumer are
;; reservations whose names match "n7_vis_*" (n7_vis_11cycles and
;; n7_vis_12cycles in this file).
(define_bypass 3
  "n7_vis_*"
  "n7_vis_*")
|
|
172
|
|
173 ;; Some other VIS instructions have a latency of 12 cycles, and won't
|
|
174 ;; be executed in the 3-cycle crypto pipe.
|
|
175
|
|
(define_insn_reservation "n7_lzd" 12
  (and (eq_attr "cpu" "niagara7")
       ;; The value list used to read "lzd," with a stray trailing comma,
       ;; leaving an empty list element; only "lzd" is intended.
       (ior (eq_attr "type" "lzd")
            (and (eq_attr "type" "gsr")
                 (eq_attr "subtype" "alignaddr"))))
  ;; One cycle in slot1, then 11 cycles with no unit reserved.
  "n7_slot1, nothing*11")
|
|
182
|
|
183 ;; A couple of VIS instructions feature very low latencies in the M7.
|
|
184
|
|
(define_insn_reservation "n7_single_vis"
  1  ;; latency in cycles
  ;; Same three conditions as before, written as nested two-operand ANDs.
  (and (eq_attr "cpu" "niagara7")
       (and (eq_attr "type" "vismv")
            (eq_attr "subtype" "movxtod")))
  "n7_slot1")
|
|
190
|
|
(define_insn_reservation "n7_double_vis"
  2  ;; latency in cycles
  ;; Same three conditions as before, written as nested two-operand ANDs.
  (and (eq_attr "cpu" "niagara7")
       (and (eq_attr "type" "vismv")
            (eq_attr "subtype" "movdtox")))
  ;; One cycle in slot1, then one cycle with no unit reserved.
  "n7_slot1, nothing")
|
|
196
|
|
;; Reading and writing to the gsr register takes a high number of
;; cycles that is not documented in the PRM.  Let's use the same value
;; as the M8.
|
|
200
|
|
(define_insn_reservation "n7_gsr_reg" 70
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "gsr")
       (eq_attr "subtype" "reg"))
  ;; One cycle in slot1 plus 69 idle cycles, so the reservation spans
  ;; exactly the 70-cycle latency.  This follows the nothing*(latency-1)
  ;; pattern of the other multi-cycle reservations in this file; the
  ;; previous "nothing*70" made it one cycle too long.
  "n7_slot1, nothing*69")
|