0
|
1 ;; Scheduling for the Intel P6 family of processors
|
|
2 ;; Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
|
|
3 ;;
|
|
4 ;; This file is part of GCC.
|
|
5 ;;
|
|
6 ;; GCC is free software; you can redistribute it and/or modify
|
|
7 ;; it under the terms of the GNU General Public License as published by
|
|
8 ;; the Free Software Foundation; either version 3, or (at your option)
|
|
9 ;; any later version.
|
|
10 ;;
|
|
11 ;; GCC is distributed in the hope that it will be useful,
|
|
12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 ;; GNU General Public License for more details.
|
|
15 ;;
|
|
16 ;; You should have received a copy of the GNU General Public License
|
|
17 ;; along with GCC; see the file COPYING3. If not see
|
|
18 ;; <http://www.gnu.org/licenses/>.
|
|
19
|
|
20 ;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
|
|
21 ;; and Xeon lines of CPUs. The DFA scheduler description in this file is
|
|
22 ;; based on information that can be found in the following three documents:
|
|
23 ;;
|
|
24 ;; "P6 Family of Processors Hardware Developer's Manual",
|
|
25 ;; Intel, September 1999.
|
|
26 ;;
|
|
27 ;; "Intel Architecture Optimization Manual",
|
|
28 ;; Intel, 1999 (Order Number: 245127-001).
|
|
29 ;;
|
|
30 ;; "How to optimize for the Pentium family of microprocessors",
|
|
31 ;; by Agner Fog, PhD.
|
|
32 ;;
|
|
33 ;; The P6 pipeline has three major components:
|
|
34 ;; 1) the FETCH/DECODE unit, an in-order issue front-end
|
|
35 ;; 2) the DISPATCH/EXECUTE unit, which is the out-of-order core
|
|
36 ;; 3) the RETIRE unit, an in-order retirement unit
|
|
37 ;;
|
|
38 ;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
|
|
39 ;; retirement unit are naturally in-order.
|
|
40 ;;
|
|
41 ;; BUS INTERFACE UNIT
|
|
42 ;; / \
|
|
43 ;; L1 ICACHE L1 DCACHE
|
|
44 ;; / | \ | \
|
|
45 ;; DECODER0 DECODER1 DECODER2 DISP/EXEC RETIRE
|
|
46 ;; \ | / | |
|
|
47 ;; INSTRUCTION POOL __________|_______/
|
|
48 ;; (inc. reorder buffer)
|
|
49 ;;
|
|
50 ;; Since the P6 CPUs execute instructions out-of-order, the most important
|
|
51 ;; consideration in performance tuning is making sure enough micro-ops are
|
|
52 ;; ready for execution in the out-of-order core, while not stalling the
|
|
53 ;; decoder.
|
|
54 ;;
|
|
55 ;; TODO:
|
|
56 ;; - Find a less crude way to model complex instructions, in
|
|
57 ;; particular how many cycles they take to be decoded.
|
|
58 ;; - Include decoder latencies in the total reservation latencies.
|
|
59 ;; This isn't necessary right now because we assume for every
|
|
60 ;; instruction that it never blocks a decoder.
|
|
61 ;; - Figure out where the p0 and p1 reservations come from. These
|
|
62 ;; appear not to be in the manual.
|
|
63 ;; - Lots more because I'm sure this is still far from optimal :-)
|
|
64
|
|
65 ;; The ppro_idiv and ppro_fdiv automata are used to model issue
|
|
66 ;; latencies of idiv and fdiv type insns.
|
|
67 (define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")
|
|
68
|
|
69 ;; Simple instructions of the register-register form have only one uop.
|
|
70 ;; Load instructions are also only one uop. Store instructions decode to
|
|
71 ;; two uops, and simple read-modify instructions also take two uops.
|
|
72 ;; Simple instructions of the register-memory form have two to three uops.
|
|
73 ;; Simple read-modify-write instructions have four uops. The rules for
|
|
74 ;; the decoder are simple:
|
|
75 ;; - an instruction with 1 uop can be decoded by any of the three
|
|
76 ;; decoders in one cycle.
|
|
77 ;; - an instruction with 2 to 4 uops can be decoded only by decoder 0
|
|
78 ;; but still in only one cycle.
|
|
79 ;; - a complex (microcode) instruction can also only be decoded by
|
|
80 ;; decoder 0, and this takes an unspecified number of cycles.
|
|
81 ;;
|
|
82 ;; The goal is to schedule such that we have a few-one-one uops sequence
|
|
83 ;; in each cycle, to decode as many instructions per cycle as possible.
|
|
84 (define_cpu_unit "decoder0" "ppro_decoder")
|
|
85 (define_cpu_unit "decoder1" "ppro_decoder")
|
|
86 (define_cpu_unit "decoder2" "ppro_decoder")
|
|
87
|
|
88 ;; We first wish to find an instruction for decoder0, so exclude
|
|
89 ;; decoder1 and decoder2 from being reserved until decoder 0 is
|
|
90 ;; reserved.
|
|
91 (presence_set "decoder1" "decoder0")
|
|
92 (presence_set "decoder2" "decoder0")
|
|
93
|
|
94 ;; Most instructions can be decoded on any of the three decoders.
|
|
95 (define_reservation "decodern" "(decoder0|decoder1|decoder2)")
|
|
96
|
|
97 ;; The out-of-order core has five pipelines. During each cycle, the core
|
|
98 ;; may dispatch zero or one uop on the port of any of the five pipelines
|
|
99 ;; so the maximum number of dispatched uops per cycle is 5. In practice,
|
|
100 ;; 3 uops per cycle is more realistic.
|
|
101 ;;
|
|
102 ;; Two of the five pipelines contain several execution units:
|
|
103 ;;
|
|
104 ;; Port 0 Port 1 Port 2 Port 3 Port 4
|
|
105 ;; ALU ALU LOAD SAC SDA
|
|
106 ;; FPU JUE
|
|
107 ;; AGU MMX
|
|
108 ;; MMX P3FPU
|
|
109 ;; P3FPU
|
|
110 ;;
|
|
111 ;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
|
|
112 ;; JUE = Jump Execution Unit, AGU = Address Generation Unit)
|
|
113 ;;
|
|
114 (define_cpu_unit "p0,p1" "ppro_core")
|
|
115 (define_cpu_unit "p2" "ppro_load")
|
|
116 (define_cpu_unit "p3,p4" "ppro_store")
|
|
117 (define_cpu_unit "idiv" "ppro_idiv")
|
|
118 (define_cpu_unit "fdiv" "ppro_fdiv")
|
|
119
|
|
120 ;; Only the irregular instructions have to be modeled here. A load
|
|
121 ;; increases the latency by 2 or 3, or by nothing if the manual gives
|
|
122 ;; a latency already. Store latencies are not accounted for.
|
|
123 ;;
|
|
124 ;; The simple instructions follow a very regular pattern of 1 uop per
|
|
125 ;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
|
|
126 ;; on port 4 and port 3. These instructions are modelled at the bottom
|
|
127 ;; of this file.
|
|
128 ;;
|
|
129 ;; For microcoded instructions we don't know how many uops are produced.
|
|
130 ;; These instructions are the "complex" ones in the Intel manuals. All
|
|
131 ;; we _do_ know is that they typically produce four or more uops, so
|
|
132 ;; they can only be decoded on decoder0. Modelling their latencies
|
|
133 ;; doesn't make sense because we don't know how these instructions are
|
|
134 ;; executed in the core. So we just model that they can only be decoded
|
|
135 ;; on decoder 0, and say that it takes a little while before the result
|
|
136 ;; is available.
|
|
137 (define_insn_reservation "ppro_complex_insn" 6
|
|
138 (and (eq_attr "cpu" "pentiumpro")
|
|
139 (eq_attr "type" "other,multi,call,callv,str"))
|
|
140 "decoder0")
|
|
141
|
|
142 ;; imov with memory operands does not use the integer units.
|
|
143 (define_insn_reservation "ppro_imov" 1
|
|
144 (and (eq_attr "cpu" "pentiumpro")
|
|
145 (and (eq_attr "memory" "none")
|
|
146 (eq_attr "type" "imov")))
|
|
147 "decodern,(p0|p1)")
|
|
148
|
|
149 (define_insn_reservation "ppro_imov_load" 4
|
|
150 (and (eq_attr "cpu" "pentiumpro")
|
|
151 (and (eq_attr "memory" "load")
|
|
152 (eq_attr "type" "imov")))
|
|
153 "decodern,p2")
|
|
154
|
|
155 (define_insn_reservation "ppro_imov_store" 1
|
|
156 (and (eq_attr "cpu" "pentiumpro")
|
|
157 (and (eq_attr "memory" "store")
|
|
158 (eq_attr "type" "imov")))
|
|
159 "decoder0,p4+p3")
|
|
160
|
|
161 ;; imovx always decodes to one uop, and also doesn't use the integer
|
|
162 ;; units if it has memory operands.
|
|
163 (define_insn_reservation "ppro_imovx" 1
|
|
164 (and (eq_attr "cpu" "pentiumpro")
|
|
165 (and (eq_attr "memory" "none")
|
|
166 (eq_attr "type" "imovx")))
|
|
167 "decodern,(p0|p1)")
|
|
168
|
|
169 (define_insn_reservation "ppro_imovx_load" 4
|
|
170 (and (eq_attr "cpu" "pentiumpro")
|
|
171 (and (eq_attr "memory" "load")
|
|
172 (eq_attr "type" "imovx")))
|
|
173 "decodern,p2")
|
|
174
|
|
175 ;; lea executes on port 0 with latency one and throughput 1.
|
|
176 (define_insn_reservation "ppro_lea" 1
|
|
177 (and (eq_attr "cpu" "pentiumpro")
|
|
178 (and (eq_attr "memory" "none")
|
|
179 (eq_attr "type" "lea")))
|
|
180 "decodern,p0")
|
|
181
|
|
182 ;; Shift and rotate execute on port 0 with latency and throughput 1.
|
|
183 ;; The load and store units need to be reserved when memory operands
|
|
184 ;; are involved.
|
|
185 (define_insn_reservation "ppro_shift_rotate" 1
|
|
186 (and (eq_attr "cpu" "pentiumpro")
|
|
187 (and (eq_attr "memory" "none")
|
|
188 (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
|
|
189 "decodern,p0")
|
|
190
|
|
191 (define_insn_reservation "ppro_shift_rotate_mem" 4
|
|
192 (and (eq_attr "cpu" "pentiumpro")
|
|
193 (and (eq_attr "memory" "!none")
|
|
194 (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
|
|
195 "decoder0,p2+p0,p4+p3")
|
|
196
|
|
197
|
|
198 ;; The P6 has a sophisticated branch prediction mechanism to minimize
|
|
199 ;; latencies due to branching. In particular, it has a fast way to
|
|
200 ;; execute branches that are taken multiple times (such as in loops).
|
|
201 ;; Branches not taken suffer no penalty, and correctly predicted
|
|
202 ;; branches cost only one fetch cycle. Mispredicted branches are very
|
|
203 ;; costly: typically 15 cycles and possibly as many as 26 cycles.
|
|
204 ;;
|
|
205 ;; Unfortunately all this makes it quite difficult to properly model
|
|
206 ;; the latencies for the compiler. Here I've made the choice to be
|
|
207 ;; optimistic and assume branches are often predicted correctly, so
|
|
208 ;; they have latency 1, and the decoders are not blocked.
|
|
209 ;;
|
|
210 ;; In addition, the model assumes a branch always decodes to only 1 uop,
|
|
211 ;; which is not exactly true because there are a few instructions that
|
|
212 ;; decode to 2 uops or microcode. But this probably gives the best
|
|
213 ;; results because we can assume these instructions can decode on all
|
|
214 ;; decoders.
|
|
215 (define_insn_reservation "ppro_branch" 1
|
|
216 (and (eq_attr "cpu" "pentiumpro")
|
|
217 (and (eq_attr "memory" "none")
|
|
218 (eq_attr "type" "ibr")))
|
|
219 "decodern,p1")
|
|
220
|
|
221 ;; ??? Indirect branches probably have worse latency than this.
|
|
222 (define_insn_reservation "ppro_indirect_branch" 6
|
|
223 (and (eq_attr "cpu" "pentiumpro")
|
|
224 (and (eq_attr "memory" "!none")
|
|
225 (eq_attr "type" "ibr")))
|
|
226 "decoder0,p2+p1")
|
|
227
|
|
228 (define_insn_reservation "ppro_leave" 4
|
|
229 (and (eq_attr "cpu" "pentiumpro")
|
|
230 (eq_attr "type" "leave"))
|
|
231 "decoder0,p2+(p0|p1),(p0|p1)")
|
|
232
|
|
233 ;; imul has throughput one, but latency 4, and can only execute on port 0.
|
|
234 (define_insn_reservation "ppro_imul" 4
|
|
235 (and (eq_attr "cpu" "pentiumpro")
|
|
236 (and (eq_attr "memory" "none")
|
|
237 (eq_attr "type" "imul")))
|
|
238 "decodern,p0")
|
|
239
|
|
240 (define_insn_reservation "ppro_imul_mem" 4
|
|
241 (and (eq_attr "cpu" "pentiumpro")
|
|
242 (and (eq_attr "memory" "!none")
|
|
243 (eq_attr "type" "imul")))
|
|
244 "decoder0,p2+p0")
|
|
245
|
|
246 ;; div and idiv are very similar, so we model them the same.
|
|
247 ;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
|
|
248 ;; These issue latencies are modelled via the ppro_idiv automaton.
|
|
249 (define_insn_reservation "ppro_idiv_QI" 19
|
|
250 (and (eq_attr "cpu" "pentiumpro")
|
|
251 (and (eq_attr "memory" "none")
|
|
252 (and (eq_attr "mode" "QI")
|
|
253 (eq_attr "type" "idiv"))))
|
|
254 "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")
|
|
255
|
|
256 (define_insn_reservation "ppro_idiv_QI_load" 19
|
|
257 (and (eq_attr "cpu" "pentiumpro")
|
|
258 (and (eq_attr "memory" "load")
|
|
259 (and (eq_attr "mode" "QI")
|
|
260 (eq_attr "type" "idiv"))))
|
|
261 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")
|
|
262
|
|
263 (define_insn_reservation "ppro_idiv_HI" 23
|
|
264 (and (eq_attr "cpu" "pentiumpro")
|
|
265 (and (eq_attr "memory" "none")
|
|
266 (and (eq_attr "mode" "HI")
|
|
267 (eq_attr "type" "idiv"))))
|
|
268 "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")
|
|
269
|
|
270 (define_insn_reservation "ppro_idiv_HI_load" 23
|
|
271 (and (eq_attr "cpu" "pentiumpro")
|
|
272 (and (eq_attr "memory" "load")
|
|
273 (and (eq_attr "mode" "HI")
|
|
274 (eq_attr "type" "idiv"))))
|
|
275 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")
|
|
276
|
|
277 (define_insn_reservation "ppro_idiv_SI" 39
|
|
278 (and (eq_attr "cpu" "pentiumpro")
|
|
279 (and (eq_attr "memory" "none")
|
|
280 (and (eq_attr "mode" "SI")
|
|
281 (eq_attr "type" "idiv"))))
|
|
282 "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")
|
|
283
|
|
284 (define_insn_reservation "ppro_idiv_SI_load" 39
|
|
285 (and (eq_attr "cpu" "pentiumpro")
|
|
286 (and (eq_attr "memory" "load")
|
|
287 (and (eq_attr "mode" "SI")
|
|
288 (eq_attr "type" "idiv"))))
|
|
289 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")
|
|
290
|
|
291 ;; Floating point operations always execute on port 0.
|
|
292 ;; ??? where do these latencies come from? fadd has latency 3 and
|
|
293 ;; has throughput "1/cycle (align with FADD)". What do they
|
|
294 ;; mean and how can we model that?
|
|
295 (define_insn_reservation "ppro_fop" 3
|
|
296 (and (eq_attr "cpu" "pentiumpro")
|
|
297 (and (eq_attr "memory" "none,unknown")
|
|
298 (eq_attr "type" "fop")))
|
|
299 "decodern,p0")
|
|
300
|
|
301 (define_insn_reservation "ppro_fop_load" 5
|
|
302 (and (eq_attr "cpu" "pentiumpro")
|
|
303 (and (eq_attr "memory" "load")
|
|
304 (eq_attr "type" "fop")))
|
|
305 "decoder0,p2+p0,p0")
|
|
306
|
|
307 (define_insn_reservation "ppro_fop_store" 3
|
|
308 (and (eq_attr "cpu" "pentiumpro")
|
|
309 (and (eq_attr "memory" "store")
|
|
310 (eq_attr "type" "fop")))
|
|
311 "decoder0,p0,p0,p0+p4+p3")
|
|
312
|
|
313 (define_insn_reservation "ppro_fop_both" 5
|
|
314 (and (eq_attr "cpu" "pentiumpro")
|
|
315 (and (eq_attr "memory" "both")
|
|
316 (eq_attr "type" "fop")))
|
|
317 "decoder0,p2+p0,p0+p4+p3")
|
|
318
|
|
319 (define_insn_reservation "ppro_fsgn" 1
|
|
320 (and (eq_attr "cpu" "pentiumpro")
|
|
321 (eq_attr "type" "fsgn"))
|
|
322 "decodern,p0")
|
|
323
|
|
324 (define_insn_reservation "ppro_fistp" 5
|
|
325 (and (eq_attr "cpu" "pentiumpro")
|
|
326 (eq_attr "type" "fistp"))
|
|
327 "decoder0,p0*2,p4+p3")
|
|
328
|
|
329 (define_insn_reservation "ppro_fcmov" 2
|
|
330 (and (eq_attr "cpu" "pentiumpro")
|
|
331 (eq_attr "type" "fcmov"))
|
|
332 "decoder0,p0*2")
|
|
333
|
|
334 (define_insn_reservation "ppro_fcmp" 1
|
|
335 (and (eq_attr "cpu" "pentiumpro")
|
|
336 (and (eq_attr "memory" "none")
|
|
337 (eq_attr "type" "fcmp")))
|
|
338 "decodern,p0")
|
|
339
|
|
340 (define_insn_reservation "ppro_fcmp_load" 4
|
|
341 (and (eq_attr "cpu" "pentiumpro")
|
|
342 (and (eq_attr "memory" "load")
|
|
343 (eq_attr "type" "fcmp")))
|
|
344 "decoder0,p2+p0")
|
|
345
|
|
346 (define_insn_reservation "ppro_fmov" 1
|
|
347 (and (eq_attr "cpu" "pentiumpro")
|
|
348 (and (eq_attr "memory" "none")
|
|
349 (eq_attr "type" "fmov")))
|
|
350 "decodern,p0")
|
|
351
|
|
352 (define_insn_reservation "ppro_fmov_load" 1
|
|
353 (and (eq_attr "cpu" "pentiumpro")
|
|
354 (and (eq_attr "memory" "load")
|
|
355 (and (eq_attr "mode" "!XF")
|
|
356 (eq_attr "type" "fmov"))))
|
|
357 "decodern,p2")
|
|
358
|
|
359 (define_insn_reservation "ppro_fmov_XF_load" 3
|
|
360 (and (eq_attr "cpu" "pentiumpro")
|
|
361 (and (eq_attr "memory" "load")
|
|
362 (and (eq_attr "mode" "XF")
|
|
363 (eq_attr "type" "fmov"))))
|
|
364 "decoder0,(p2+p0)*2")
|
|
365
|
|
366 (define_insn_reservation "ppro_fmov_store" 1
|
|
367 (and (eq_attr "cpu" "pentiumpro")
|
|
368 (and (eq_attr "memory" "store")
|
|
369 (and (eq_attr "mode" "!XF")
|
|
370 (eq_attr "type" "fmov"))))
|
|
371 "decodern,p0")
|
|
372
|
|
373 (define_insn_reservation "ppro_fmov_XF_store" 3
|
|
374 (and (eq_attr "cpu" "pentiumpro")
|
|
375 (and (eq_attr "memory" "store")
|
|
376 (and (eq_attr "mode" "XF")
|
|
377 (eq_attr "type" "fmov"))))
|
|
378 "decoder0,(p0+p4),(p0+p3)")
|
|
379
|
|
380 ;; fmul executes on port 0 with latency 5. It has issue latency 2,
|
|
381 ;; but we don't model this.
|
|
382 (define_insn_reservation "ppro_fmul" 5
|
|
383 (and (eq_attr "cpu" "pentiumpro")
|
|
384 (and (eq_attr "memory" "none")
|
|
385 (eq_attr "type" "fmul")))
|
|
386 "decoder0,p0*2")
|
|
387
|
|
388 (define_insn_reservation "ppro_fmul_load" 6
|
|
389 (and (eq_attr "cpu" "pentiumpro")
|
|
390 (and (eq_attr "memory" "load")
|
|
391 (eq_attr "type" "fmul")))
|
|
392 "decoder0,p2+p0,p0")
|
|
393
|
|
394 ;; fdiv latencies depend on the mode of the operands. XFmode gives
|
|
395 ;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
|
|
396 ;; Division by a power of 2 takes only 9 cycles, but we cannot model
|
|
397 ;; that. Throughput is equal to latency - 1, which we model using the
|
|
398 ;; ppro_fdiv automaton.
|
|
399 (define_insn_reservation "ppro_fdiv_SF" 18
|
|
400 (and (eq_attr "cpu" "pentiumpro")
|
|
401 (and (eq_attr "memory" "none")
|
|
402 (and (eq_attr "mode" "SF")
|
|
403 (eq_attr "type" "fdiv,fpspc"))))
|
|
404 "decodern,p0+fdiv,fdiv*16")
|
|
405
|
|
406 (define_insn_reservation "ppro_fdiv_SF_load" 19
|
|
407 (and (eq_attr "cpu" "pentiumpro")
|
|
408 (and (eq_attr "memory" "load")
|
|
409 (and (eq_attr "mode" "SF")
|
|
410 (eq_attr "type" "fdiv,fpspc"))))
|
|
411 "decoder0,p2+p0+fdiv,fdiv*16")
|
|
412
|
|
413 (define_insn_reservation "ppro_fdiv_DF" 32
|
|
414 (and (eq_attr "cpu" "pentiumpro")
|
|
415 (and (eq_attr "memory" "none")
|
|
416 (and (eq_attr "mode" "DF")
|
|
417 (eq_attr "type" "fdiv,fpspc"))))
|
|
418 "decodern,p0+fdiv,fdiv*30")
|
|
419
|
|
420 (define_insn_reservation "ppro_fdiv_DF_load" 33
|
|
421 (and (eq_attr "cpu" "pentiumpro")
|
|
422 (and (eq_attr "memory" "load")
|
|
423 (and (eq_attr "mode" "DF")
|
|
424 (eq_attr "type" "fdiv,fpspc"))))
|
|
425 "decoder0,p2+p0+fdiv,fdiv*30")
|
|
426
|
|
427 (define_insn_reservation "ppro_fdiv_XF" 38
|
|
428 (and (eq_attr "cpu" "pentiumpro")
|
|
429 (and (eq_attr "memory" "none")
|
|
430 (and (eq_attr "mode" "XF")
|
|
431 (eq_attr "type" "fdiv,fpspc"))))
|
|
432 "decodern,p0+fdiv,fdiv*36")
|
|
433
|
|
434 (define_insn_reservation "ppro_fdiv_XF_load" 39
|
|
435 (and (eq_attr "cpu" "pentiumpro")
|
|
436 (and (eq_attr "memory" "load")
|
|
437 (and (eq_attr "mode" "XF")
|
|
438 (eq_attr "type" "fdiv,fpspc"))))
|
|
439 "decoder0,p2+p0+fdiv,fdiv*36")
|
|
440
|
|
441 ;; MMX instructions can execute on either port 0 or port 1 with a
|
|
442 ;; throughput of 1/cycle.
|
|
443 ;; on port 0: - ALU (latency 1)
|
|
444 ;; - Multiplier Unit (latency 3)
|
|
445 ;; on port 1: - ALU (latency 1)
|
|
446 ;; - Shift Unit (latency 1)
|
|
447 ;;
|
|
448 ;; MMX instructions are either of the type reg-reg, or read-modify, and
|
|
449 ;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
|
|
450 ;; so they behave as "simple" instructions that need no special modelling.
|
|
451 ;; We only have to model mmxshft and mmxmul.
|
|
452 (define_insn_reservation "ppro_mmx_shft" 1
|
|
453 (and (eq_attr "cpu" "pentiumpro")
|
|
454 (and (eq_attr "memory" "none")
|
|
455 (eq_attr "type" "mmxshft")))
|
|
456 "decodern,p1")
|
|
457
|
|
;; MMX shift with a memory source operand.  The extra load uop issues on
;; port 2 alongside the shift on port 1, and the instruction is restricted
;; to decoder0.  The memory test must be "load": the previous "none" made
;; this condition identical to ppro_mmx_shft, so an mmxshft insn with a
;; load operand was never described by this reservation.
(define_insn_reservation "ppro_mmx_shft_load" 2
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxshft")))
  "decoder0,p2+p1")
|
|
463
|
|
464 (define_insn_reservation "ppro_mmx_mul" 3
|
|
465 (and (eq_attr "cpu" "pentiumpro")
|
|
466 (and (eq_attr "memory" "none")
|
|
467 (eq_attr "type" "mmxmul")))
|
|
468 "decodern,p0")
|
|
469
|
|
;; MMX multiply with a memory source operand.  The load uop issues on
;; port 2 alongside the multiply on port 0, and the instruction is
;; restricted to decoder0.  The memory test must be "load": the previous
;; "none" made this condition identical to ppro_mmx_mul, so an mmxmul insn
;; with a load operand was never described by this reservation.
(define_insn_reservation "ppro_mmx_mul_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxmul")))
  "decoder0,p2+p0")
|
|
475
|
|
476 (define_insn_reservation "ppro_sse_mmxcvt" 4
|
|
477 (and (eq_attr "cpu" "pentiumpro")
|
|
478 (and (eq_attr "mode" "DI")
|
|
479 (eq_attr "type" "mmxcvt")))
|
|
480 "decodern,p1")
|
|
481
|
|
482 ;; FIXME: These are Pentium III only, but we cannot tell here if
|
|
483 ;; we're generating code for PentiumPro/Pentium II or Pentium III
|
|
484 ;; (define_insn_reservation "ppro_sse_mmxshft" 2
|
|
485 ;; (and (eq_attr "cpu" "pentiumpro")
|
|
486 ;; (and (eq_attr "mode" "DI")
|
|
487 ;; (eq_attr "type" "mmxshft")))
|
|
488 ;; "decodern,p0")
|
|
489
|
|
490 ;; SSE is very complicated, and takes a bit more effort.
|
|
491 ;; ??? I assumed that all SSE instructions decode on decoder0,
|
|
492 ;; but is this correct?
|
|
493
|
|
494 ;; The sfence instruction.
|
|
495 (define_insn_reservation "ppro_sse_sfence" 3
|
|
496 (and (eq_attr "cpu" "pentiumpro")
|
|
497 (and (eq_attr "memory" "unknown")
|
|
498 (eq_attr "type" "sse")))
|
|
499 "decoder0,p4+p3")
|
|
500
|
|
501 ;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
|
|
502 (define_insn_reservation "ppro_sse_SF" 3
|
|
503 (and (eq_attr "cpu" "pentiumpro")
|
|
504 (and (eq_attr "mode" "SF")
|
|
505 (eq_attr "type" "sse")))
|
|
506 "decodern,p0")
|
|
507
|
|
508 (define_insn_reservation "ppro_sse_add_SF" 3
|
|
509 (and (eq_attr "cpu" "pentiumpro")
|
|
510 (and (eq_attr "memory" "none")
|
|
511 (and (eq_attr "mode" "SF")
|
|
512 (eq_attr "type" "sseadd"))))
|
|
513 "decodern,p1")
|
|
514
|
|
515 (define_insn_reservation "ppro_sse_add_SF_load" 3
|
|
516 (and (eq_attr "cpu" "pentiumpro")
|
|
517 (and (eq_attr "memory" "load")
|
|
518 (and (eq_attr "mode" "SF")
|
|
519 (eq_attr "type" "sseadd"))))
|
|
520 "decoder0,p2+p1")
|
|
521
|
|
522 (define_insn_reservation "ppro_sse_cmp_SF" 3
|
|
523 (and (eq_attr "cpu" "pentiumpro")
|
|
524 (and (eq_attr "memory" "none")
|
|
525 (and (eq_attr "mode" "SF")
|
|
526 (eq_attr "type" "ssecmp"))))
|
|
527 "decoder0,p1")
|
|
528
|
|
529 (define_insn_reservation "ppro_sse_cmp_SF_load" 3
|
|
530 (and (eq_attr "cpu" "pentiumpro")
|
|
531 (and (eq_attr "memory" "load")
|
|
532 (and (eq_attr "mode" "SF")
|
|
533 (eq_attr "type" "ssecmp"))))
|
|
534 "decoder0,p2+p1")
|
|
535
|
|
536 (define_insn_reservation "ppro_sse_comi_SF" 1
|
|
537 (and (eq_attr "cpu" "pentiumpro")
|
|
538 (and (eq_attr "memory" "none")
|
|
539 (and (eq_attr "mode" "SF")
|
|
540 (eq_attr "type" "ssecomi"))))
|
|
541 "decodern,p0")
|
|
542
|
|
543 (define_insn_reservation "ppro_sse_comi_SF_load" 1
|
|
544 (and (eq_attr "cpu" "pentiumpro")
|
|
545 (and (eq_attr "memory" "load")
|
|
546 (and (eq_attr "mode" "SF")
|
|
547 (eq_attr "type" "ssecomi"))))
|
|
548 "decoder0,p2+p0")
|
|
549
|
|
550 (define_insn_reservation "ppro_sse_mul_SF" 4
|
|
551 (and (eq_attr "cpu" "pentiumpro")
|
|
552 (and (eq_attr "memory" "none")
|
|
553 (and (eq_attr "mode" "SF")
|
|
554 (eq_attr "type" "ssemul"))))
|
|
555 "decodern,p0")
|
|
556
|
|
557 (define_insn_reservation "ppro_sse_mul_SF_load" 4
|
|
558 (and (eq_attr "cpu" "pentiumpro")
|
|
559 (and (eq_attr "memory" "load")
|
|
560 (and (eq_attr "mode" "SF")
|
|
561 (eq_attr "type" "ssemul"))))
|
|
562 "decoder0,p2+p0")
|
|
563
|
|
564 ;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
|
|
565 (define_insn_reservation "ppro_sse_div_SF" 18
|
|
566 (and (eq_attr "cpu" "pentiumpro")
|
|
567 (and (eq_attr "memory" "none")
|
|
568 (and (eq_attr "mode" "SF")
|
|
569 (eq_attr "type" "ssediv"))))
|
|
570 "decoder0,p0*17")
|
|
571
|
|
;; Scalar SSE divide with a memory source operand.  The first cycle
;; reserves the load port (p2) together with p0; p0 then stays reserved
;; for 16 more cycles, giving the same 17-cycle p0 occupancy as
;; ppro_sse_div_SF.  The memory test must be "load": the previous "none"
;; made this condition identical to ppro_sse_div_SF, so an SFmode ssediv
;; insn with a load operand was never described by this reservation.
(define_insn_reservation "ppro_sse_div_SF_load" 18
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,(p2+p0),p0*16")
|
|
578
|
|
579 (define_insn_reservation "ppro_sse_icvt_SF" 4
|
|
580 (and (eq_attr "cpu" "pentiumpro")
|
|
581 (and (eq_attr "mode" "SF")
|
|
582 (eq_attr "type" "sseicvt")))
|
|
583 "decoder0,(p2+p1)*2")
|
|
584
|
|
585 (define_insn_reservation "ppro_sse_icvt_SI" 3
|
|
586 (and (eq_attr "cpu" "pentiumpro")
|
|
587 (and (eq_attr "mode" "SI")
|
|
588 (eq_attr "type" "sseicvt")))
|
|
589 "decoder0,(p2+p1)")
|
|
590
|
|
591 (define_insn_reservation "ppro_sse_mov_SF" 3
|
|
592 (and (eq_attr "cpu" "pentiumpro")
|
|
593 (and (eq_attr "memory" "none")
|
|
594 (and (eq_attr "mode" "SF")
|
|
595 (eq_attr "type" "ssemov"))))
|
|
596 "decoder0,(p0|p1)")
|
|
597
|
|
598 (define_insn_reservation "ppro_sse_mov_SF_load" 3
|
|
599 (and (eq_attr "cpu" "pentiumpro")
|
|
600 (and (eq_attr "memory" "load")
|
|
601 (and (eq_attr "mode" "SF")
|
|
602 (eq_attr "type" "ssemov"))))
|
|
603 "decoder0,p2+(p0|p1)")
|
|
604
|
|
605 (define_insn_reservation "ppro_sse_mov_SF_store" 3
|
|
606 (and (eq_attr "cpu" "pentiumpro")
|
|
607 (and (eq_attr "memory" "store")
|
|
608 (and (eq_attr "mode" "SF")
|
|
609 (eq_attr "type" "ssemov"))))
|
|
610 "decoder0,p4+p3")
|
|
611
|
|
612 (define_insn_reservation "ppro_sse_V4SF" 4
|
|
613 (and (eq_attr "cpu" "pentiumpro")
|
|
614 (and (eq_attr "mode" "V4SF")
|
|
615 (eq_attr "type" "sse")))
|
|
616 "decoder0,p1*2")
|
|
617
|
|
618 (define_insn_reservation "ppro_sse_add_V4SF" 3
|
|
619 (and (eq_attr "cpu" "pentiumpro")
|
|
620 (and (eq_attr "memory" "none")
|
|
621 (and (eq_attr "mode" "V4SF")
|
|
622 (eq_attr "type" "sseadd"))))
|
|
623 "decoder0,p1*2")
|
|
624
|
|
625 (define_insn_reservation "ppro_sse_add_V4SF_load" 3
|
|
626 (and (eq_attr "cpu" "pentiumpro")
|
|
627 (and (eq_attr "memory" "load")
|
|
628 (and (eq_attr "mode" "V4SF")
|
|
629 (eq_attr "type" "sseadd"))))
|
|
630 "decoder0,(p2+p1)*2")
|
|
631
|
|
632 (define_insn_reservation "ppro_sse_cmp_V4SF" 3
|
|
633 (and (eq_attr "cpu" "pentiumpro")
|
|
634 (and (eq_attr "memory" "none")
|
|
635 (and (eq_attr "mode" "V4SF")
|
|
636 (eq_attr "type" "ssecmp"))))
|
|
637 "decoder0,p1*2")
|
|
638
|
|
639 (define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
|
|
640 (and (eq_attr "cpu" "pentiumpro")
|
|
641 (and (eq_attr "memory" "load")
|
|
642 (and (eq_attr "mode" "V4SF")
|
|
643 (eq_attr "type" "ssecmp"))))
|
|
644 "decoder0,(p2+p1)*2")
|
|
645
|
|
646 (define_insn_reservation "ppro_sse_cvt_V4SF" 3
|
|
647 (and (eq_attr "cpu" "pentiumpro")
|
|
648 (and (eq_attr "memory" "none,unknown")
|
|
649 (and (eq_attr "mode" "V4SF")
|
|
650 (eq_attr "type" "ssecvt"))))
|
|
651 "decoder0,p1*2")
|
|
652
|
|
;; V4SF SSE conversions whose "memory" attribute is neither "none" nor
;; "unknown", i.e. the complement of ppro_sse_cvt_V4SF.  The type test
;; must be "ssecvt" to match that sibling: the previous "ssecmp" looks
;; like a copy-paste slip that left V4SF ssecvt insns with memory operands
;; without any reservation.
;; NOTE(review): the reservation reserves the store ports (p4+p3) but not
;; the load port (p2), also for the load case -- confirm against the
;; Intel manual.
(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "!none,unknown")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecvt"))))
  "decoder0,p1,p4+p3")
|
|
659
|
|
660 (define_insn_reservation "ppro_sse_mul_V4SF" 5
|
|
661 (and (eq_attr "cpu" "pentiumpro")
|
|
662 (and (eq_attr "memory" "none")
|
|
663 (and (eq_attr "mode" "V4SF")
|
|
664 (eq_attr "type" "ssemul"))))
|
|
665 "decoder0,p0*2")
|
|
666
|
|
667 (define_insn_reservation "ppro_sse_mul_V4SF_load" 5
|
|
668 (and (eq_attr "cpu" "pentiumpro")
|
|
669 (and (eq_attr "memory" "load")
|
|
670 (and (eq_attr "mode" "V4SF")
|
|
671 (eq_attr "type" "ssemul"))))
|
|
672 "decoder0,(p2+p0)*2")
|
|
673
|
|
674 ;; FIXME: p0 really closed this long???
|
|
675 (define_insn_reservation "ppro_sse_div_V4SF" 48
|
|
676 (and (eq_attr "cpu" "pentiumpro")
|
|
677 (and (eq_attr "memory" "none")
|
|
678 (and (eq_attr "mode" "V4SF")
|
|
679 (eq_attr "type" "ssediv"))))
|
|
680 "decoder0,p0*34")
|
|
681
|
|
682 (define_insn_reservation "ppro_sse_div_V4SF_load" 48
|
|
683 (and (eq_attr "cpu" "pentiumpro")
|
|
684 (and (eq_attr "memory" "load")
|
|
685 (and (eq_attr "mode" "V4SF")
|
|
686 (eq_attr "type" "ssediv"))))
|
|
687 "decoder0,(p2+p0)*2,p0*32")
|
|
688
|
|
689 (define_insn_reservation "ppro_sse_log_V4SF" 2
|
|
690 (and (eq_attr "cpu" "pentiumpro")
|
|
691 (and (eq_attr "memory" "none")
|
|
692 (and (eq_attr "mode" "V4SF")
|
|
693 (eq_attr "type" "sselog,sselog1"))))
|
|
694 "decodern,p1")
|
|
695
|
|
696 (define_insn_reservation "ppro_sse_log_V4SF_load" 2
|
|
697 (and (eq_attr "cpu" "pentiumpro")
|
|
698 (and (eq_attr "memory" "load")
|
|
699 (and (eq_attr "mode" "V4SF")
|
|
700 (eq_attr "type" "sselog,sselog1"))))
|
|
701 "decoder0,(p2+p1)")
|
|
702
|
|
703 (define_insn_reservation "ppro_sse_mov_V4SF" 1
|
|
704 (and (eq_attr "cpu" "pentiumpro")
|
|
705 (and (eq_attr "memory" "none")
|
|
706 (and (eq_attr "mode" "V4SF")
|
|
707 (eq_attr "type" "ssemov"))))
|
|
708 "decoder0,(p0|p1)*2")
|
|
709
|
|
710 (define_insn_reservation "ppro_sse_mov_V4SF_load" 2
|
|
711 (and (eq_attr "cpu" "pentiumpro")
|
|
712 (and (eq_attr "memory" "load")
|
|
713 (and (eq_attr "mode" "V4SF")
|
|
714 (eq_attr "type" "ssemov"))))
|
|
715 "decoder0,p2*2")
|
|
716
|
|
717 (define_insn_reservation "ppro_sse_mov_V4SF_store" 3
|
|
718 (and (eq_attr "cpu" "pentiumpro")
|
|
719 (and (eq_attr "memory" "store")
|
|
720 (and (eq_attr "mode" "V4SF")
|
|
721 (eq_attr "type" "ssemov"))))
|
|
722 "decoder0,(p4+p3)*2")
|
|
723
|
|
724 ;; All other instructions are modelled as simple instructions.
|
|
725 ;; We have already modelled all i387 floating point instructions, so all
|
|
726 ;; other instructions execute on either port 0 or port 1. This includes
|
|
727 ;; the ALU units, and the MMX units.
|
|
728 ;;
|
|
729 ;; reg-reg instructions produce 1 uop so they can be decoded on any of
|
|
730 ;; the three decoders.
|
|
731 (define_insn_reservation "ppro_insn" 1
|
|
732 (and (eq_attr "cpu" "pentiumpro")
|
|
733 (and (eq_attr "memory" "none,unknown")
|
|
734 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
|
|
735 "decodern,(p0|p1)")
|
|
736
|
|
737 ;; read-modify and register-memory instructions have two or three uops,
|
|
738 ;; so they have to be decoded on decoder0.
|
|
739 (define_insn_reservation "ppro_insn_load" 3
|
|
740 (and (eq_attr "cpu" "pentiumpro")
|
|
741 (and (eq_attr "memory" "load")
|
|
742 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
|
|
743 "decoder0,p2+(p0|p1)")
|
|
744
|
|
745 (define_insn_reservation "ppro_insn_store" 1
|
|
746 (and (eq_attr "cpu" "pentiumpro")
|
|
747 (and (eq_attr "memory" "store")
|
|
748 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
|
|
749 "decoder0,(p0|p1),p4+p3")
|
|
750
|
|
751 ;; read-modify-store instructions produce 4 uops so they have to be
|
|
752 ;; decoded on decoder0 as well.
|
|
753 (define_insn_reservation "ppro_insn_both" 4
|
|
754 (and (eq_attr "cpu" "pentiumpro")
|
|
755 (and (eq_attr "memory" "both")
|
|
756 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
|
|
757 "decoder0,p2+(p0|p1),p4+p3")
|
|
758
|