0
|
1 ;; DFA-based pipeline description for the VR1x000.
|
|
2 ;; Copyright (C) 2005, 2006, 2008 Free Software Foundation, Inc.
|
|
3 ;;
|
|
4 ;; This file is part of GCC.
|
|
5
|
|
6 ;; GCC is free software; you can redistribute it and/or modify it
|
|
7 ;; under the terms of the GNU General Public License as published
|
|
8 ;; by the Free Software Foundation; either version 3, or (at your
|
|
9 ;; option) any later version.
|
|
10
|
|
11 ;; GCC is distributed in the hope that it will be useful, but WITHOUT
|
|
12 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
13 ;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
|
|
14 ;; License for more details.
|
|
15
|
|
16 ;; You should have received a copy of the GNU General Public License
|
|
17 ;; along with GCC; see the file COPYING3. If not see
|
|
18 ;; <http://www.gnu.org/licenses/>.
|
|
19
|
|
20
|
|
21 ;; R12K/R14K/R16K are derivatives of R10K, thus copy its description
|
|
22 ;; until specific tuning for each is added.
|
|
23
|
|
24 ;; R10000 has an int queue, fp queue, address queue.
|
|
25 ;; The int queue feeds ALU1 and ALU2.
|
|
26 ;; The fp queue feeds the fp-adder and fp-multiplier.
|
|
27 ;; The addr queue feeds the Load/Store unit.
|
|
28 ;;
|
|
;; However, we give the fp-adder, fp-multiplier, fp-divider and
;; fp-square-root units separate automatons, because the
;; fp-multiplier is divided into fp-multiplier, fp-division, and
;; fp-squareroot units, all of which share the same issue and
;; completion logic, yet can operate in parallel.
;;
;; This is based on the model described in the R10K Manual
;; and it helps to reduce the size of the automata.
|
|
;; Automata used by the R10000 description: one for the integer ALUs,
;; one for the fp adder, one for the address (load/store) path, and
;; one each for the fp multiplier, fp divider and fp square-root units.
(define_automaton "r10k_a_int, r10k_a_fpadder, r10k_a_addr,
                   r10k_a_fpmpy, r10k_a_fpdiv, r10k_a_fpsqrt")
|
|
40
|
|
;; Functional units, grouped by the automaton that tracks them.

;; Both integer ALUs share the integer automaton.
(define_cpu_unit "r10k_alu1, r10k_alu2" "r10k_a_int")

;; Each floating-point unit is tracked by its own automaton.
(define_cpu_unit "r10k_fpadd" "r10k_a_fpadder")
(define_cpu_unit "r10k_fpmpy" "r10k_a_fpmpy")
(define_cpu_unit "r10k_fpdiv" "r10k_a_fpdiv")
(define_cpu_unit "r10k_fpsqrt" "r10k_a_fpsqrt")

;; The load/store unit belongs to the address automaton.
(define_cpu_unit "r10k_loadstore" "r10k_a_addr")
|
|
48
|
|
49
|
|
50 ;; R10k Loads and Stores.
|
|
;; Insns of type load, prefetch and prefetchx: default latency 2,
;; one cycle on the load/store unit.
(define_insn_reservation "r10k_load" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "load,prefetch,prefetchx"))
  "r10k_loadstore")
|
|
55
|
|
;; Insns of type store, fpstore and fpidxstore: default latency 0,
;; one cycle on the load/store unit.
(define_insn_reservation "r10k_store" 0
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "store,fpstore,fpidxstore"))
  "r10k_loadstore")
|
|
60
|
|
;; Insns of type fpload and fpidxload: default latency 3 (one more
;; than r10k_load), one cycle on the load/store unit.
(define_insn_reservation "r10k_fpload" 3
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fpload,fpidxload"))
  "r10k_loadstore")
|
|
65
|
|
66
|
|
67 ;; Integer add/sub + logic ops, and mt hi/lo can be done by alu1 or alu2.
|
|
68 ;; Miscellaneous arith goes here too (this is a guess).
|
|
;; Single-cycle integer work (arith, mthilo, slt, clz, const, nop,
;; trap, logical): latency 1, issued to either ALU1 or ALU2.
(define_insn_reservation "r10k_arith" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "arith,mthilo,slt,clz,const,nop,trap,logical"))
  "r10k_alu1 | r10k_alu2")
|
|
73
|
|
74 ;; We treat mfhilo differently, because we need to know when
|
|
75 ;; it's HI and when it's LO.
|
|
;; mfhilo whose operand 1 does NOT satisfy lo_operand, i.e. a read of
;; HI.  Latency 1 on either ALU.  Kept as its own reservation so that
;; it can be named as the consumer in a define_bypass.
(define_insn_reservation "r10k_mfhi" 1
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "mfhilo")
            (not (match_operand 1 "lo_operand"))))
  "r10k_alu1 | r10k_alu2")
|
|
81
|
|
;; mfhilo whose operand 1 satisfies lo_operand, i.e. a read of LO.
;; Latency 1 on either ALU.
(define_insn_reservation "r10k_mflo" 1
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "mfhilo")
            (match_operand 1 "lo_operand")))
  "r10k_alu1 | r10k_alu2")
|
|
87
|
|
88
|
|
89 ;; ALU1 handles shifts, branch eval, and condmove.
|
|
90 ;;
|
|
;; The brancher is a separate unit inside ALU1, and it can only
;; do one branch per cycle (is this even implementable?).
|
|
93 ;;
|
|
94 ;; Unsure if the brancher handles jumps and calls as well, but since
|
|
95 ;; they're related, we'll add them here for now.
|
|
;; Shifts, branches, jumps and calls: latency 1, ALU1 only.
(define_insn_reservation "r10k_brancher" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "shift,branch,jump,call"))
  "r10k_alu1")
|
|
100
|
|
;; Conditional moves in an integer mode (SI or DI): latency 1,
;; ALU1 only.
(define_insn_reservation "r10k_int_cmove" 1
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "condmove")
            (eq_attr "mode" "SI,DI")))
  "r10k_alu1")
|
|
106
|
|
107
|
|
108 ;; Coprocessor Moves.
|
|
109 ;; mtc1/dmtc1 are handled by ALU1.
|
|
110 ;; mfc1/dmfc1 are handled by the fp-multiplier.
|
|
;; Moves to the coprocessor (type mtc): latency 3, one cycle on ALU1.
(define_insn_reservation "r10k_mt_xfer" 3
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mtc"))
  "r10k_alu1")
|
|
115
|
|
;; Moves from the coprocessor (type mfc): latency 2, one cycle on
;; the fp multiplier.
(define_insn_reservation "r10k_mf_xfer" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mfc"))
  "r10k_fpmpy")
|
|
120
|
|
121
|
|
122 ;; Only ALU2 does int multiplications and divisions.
|
|
123 ;;
|
|
124 ;; According to the Vr10000 series user manual,
|
|
125 ;; integer mult and div insns can be issued one
|
|
126 ;; cycle earlier if using register Lo. We model
|
|
127 ;; this by using the Lo value by default, as it
|
|
128 ;; is the more common value, and use a bypass
|
|
129 ;; for the Hi value when needed.
|
|
130 ;;
|
|
;; Also of note, there are different latencies
;; for MULT/DMULT (Lo 5/Hi 6) and MULTU/DMULTU (Lo 6/Hi 7).
;; However, gcc does not have separate types
;; for these insns.  Thus, to strike a balance,
;; we use the Hi latency value for imul
;; operations until the imul type can be split.
|
|
;; SImode integer multiplies (imul, imul3): latency 6, with ALU2
;; reserved for 6 consecutive cycles.
(define_insn_reservation "r10k_imul_single" 6
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "SI")))
  "r10k_alu2 * 6")
|
|
142
|
|
;; DImode integer multiplies (imul, imul3): latency 10, with ALU2
;; reserved for 10 consecutive cycles.
(define_insn_reservation "r10k_imul_double" 10
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "DI")))
  "r10k_alu2 * 10")
|
|
148
|
|
149 ;; Divides keep ALU2 busy.
|
|
;; SImode integer divides: default latency 34, with ALU2 reserved
;; for 35 consecutive cycles.
(define_insn_reservation "r10k_idiv_single" 34
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "SI")))
  "r10k_alu2 * 35")
|
|
155
|
|
;; DImode integer divides: default latency 66, with ALU2 reserved
;; for 67 consecutive cycles.
(define_insn_reservation "r10k_idiv_double" 66
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "DI")))
  "r10k_alu2 * 67")
|
|
161
|
|
;; A divide whose result is consumed by r10k_mfhi takes one cycle more
;; than the default latency of the divide: 35 instead of 34 for the
;; single form, 67 instead of 66 for the double form.
(define_bypass 35 "r10k_idiv_single" "r10k_mfhi")
(define_bypass 67 "r10k_idiv_double" "r10k_mfhi")
|
|
164
|
|
165
|
|
166 ;; Floating point add/sub, mul, abs value, neg, comp, & moves.
|
|
;; fadd, fabs, fneg and fcmp: latency 2, one cycle on the fp adder.
(define_insn_reservation "r10k_fp_miscadd" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fadd,fabs,fneg,fcmp"))
  "r10k_fpadd")
|
|
171
|
|
;; fmul and fmove: latency 2, one cycle on the fp multiplier.
(define_insn_reservation "r10k_fp_miscmul" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fmul,fmove"))
  "r10k_fpmpy")
|
|
176
|
|
;; Conditional moves in a floating-point mode (SF or DF): latency 2,
;; one cycle on the fp multiplier.
(define_insn_reservation "r10k_fp_cmove" 2
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "condmove")
            (eq_attr "mode" "SF,DF")))
  "r10k_fpmpy")
|
|
182
|
|
183
|
|
184 ;; The fcvt.s.[wl] insn has latency 4, repeat 2.
|
|
185 ;; All other fcvt insns have latency 2, repeat 1.
|
|
;; fcvt insns whose cnv_mode is I2S: latency 4, with the fp adder
;; reserved for 2 consecutive cycles.
(define_insn_reservation "r10k_fcvt_single" 4
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fcvt")
            (eq_attr "cnv_mode" "I2S")))
  "r10k_fpadd * 2")
|
|
191
|
|
;; Every fcvt insn whose cnv_mode is not I2S: latency 2, one cycle
;; on the fp adder.
(define_insn_reservation "r10k_fcvt_other" 2
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fcvt")
            (eq_attr "cnv_mode" "!I2S")))
  "r10k_fpadd")
|
|
197
|
|
198
|
|
199 ;; Run the fmadd insn through fp-adder first, then fp-multiplier.
|
|
200 ;;
|
|
201 ;; The latency for fmadd is 2 cycles if the result is used
|
|
202 ;; by another fmadd instruction.
|
|
;; fmadd: default latency 4.  The fp adder is reserved in the first
;; cycle and the fp multiplier in the cycle after it.
(define_insn_reservation "r10k_fmadd" 4
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fmadd"))
  "r10k_fpadd, r10k_fpmpy")
|
|
207
|
|
;; An fmadd feeding another fmadd sees a latency of 2 cycles instead
;; of the default 4.
(define_bypass 2 "r10k_fmadd" "r10k_fmadd")
|
|
209
|
|
210
|
|
211 ;; Floating point Divisions & square roots.
|
|
;; SFmode fdiv and frdiv: latency 12, with the fp divider reserved
;; for 14 consecutive cycles.
(define_insn_reservation "r10k_fdiv_single" 12
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "SF")))
  "r10k_fpdiv * 14")
|
|
217
|
|
;; DFmode fdiv and frdiv: latency 19, with the fp divider reserved
;; for 21 consecutive cycles.
(define_insn_reservation "r10k_fdiv_double" 19
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "DF")))
  "r10k_fpdiv * 21")
|
|
223
|
|
;; SFmode fsqrt: latency 18, with the fp square-root unit reserved
;; for 20 consecutive cycles.
(define_insn_reservation "r10k_fsqrt_single" 18
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "SF")))
  "r10k_fpsqrt * 20")
|
|
229
|
|
;; DFmode fsqrt: latency 33, with the fp square-root unit reserved
;; for 35 consecutive cycles.
(define_insn_reservation "r10k_fsqrt_double" 33
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "DF")))
  "r10k_fpsqrt * 35")
|
|
235
|
|
;; SFmode frsqrt: latency 30, with the fp square-root unit reserved
;; for 20 consecutive cycles (the same occupancy as SFmode fsqrt).
(define_insn_reservation "r10k_frsqrt_single" 30
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "SF")))
  "r10k_fpsqrt * 20")
|
|
241
|
|
;; DFmode frsqrt: latency 52, with the fp square-root unit reserved
;; for 35 consecutive cycles (the same occupancy as DFmode fsqrt).
(define_insn_reservation "r10k_frsqrt_double" 52
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "DF")))
  "r10k_fpsqrt * 35")
|
|
247
|
|
248
|
|
249 ;; Handle unknown/multi insns here (this is a guess).
|
|
;; Insns of type unknown or multi: latency 1, reserving ALU1 and
;; ALU2 together in the same cycle.
(define_insn_reservation "r10k_unknown" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "unknown,multi"))
  "r10k_alu1 + r10k_alu2")
|