comparison gcc/config/i386/ppro.md @ 0:a06113de4d67

first commit
author kent <kent@cr.ie.u-ryukyu.ac.jp>
date Fri, 17 Jul 2009 14:47:48 +0900
parents
children f6334be47118
comparison
equal deleted inserted replaced
-1:000000000000 0:a06113de4d67
1 ;; Scheduling for the Intel P6 family of processors
2 ;; Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
3 ;;
4 ;; This file is part of GCC.
5 ;;
6 ;; GCC is free software; you can redistribute it and/or modify
7 ;; it under the terms of the GNU General Public License as published by
8 ;; the Free Software Foundation; either version 3, or (at your option)
9 ;; any later version.
10 ;;
11 ;; GCC is distributed in the hope that it will be useful,
12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;; GNU General Public License for more details.
15 ;;
16 ;; You should have received a copy of the GNU General Public License
17 ;; along with GCC; see the file COPYING3. If not see
18 ;; <http://www.gnu.org/licenses/>. */
19
20 ;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
21 ;; and Xeon lines of CPUs. The DFA scheduler description in this file is
22 ;; based on information that can be found in the following three documents:
23 ;;
24 ;; "P6 Family of Processors Hardware Developer's Manual",
25 ;; Intel, September 1999.
26 ;;
27 ;; "Intel Architecture Optimization Manual",
28 ;; Intel, 1999 (Order Number: 245127-001).
29 ;;
30 ;; "How to optimize for the Pentium family of microprocessors",
31 ;; by Agner Fog, PhD.
32 ;;
33 ;; The P6 pipeline has three major components:
34 ;; 1) the FETCH/DECODE unit, an in-order issue front-end
35 ;; 2) the DISPATCH/EXECUTE unit, which is the out-of-order core
36 ;; 3) the RETIRE unit, an in-order retirement unit
37 ;;
38 ;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
39 ;; retirement unit are naturally in-order.
40 ;;
41 ;; BUS INTERFACE UNIT
42 ;; / \
43 ;; L1 ICACHE L1 DCACHE
44 ;; / | \ | \
45 ;; DECODER0 DECODER1 DECODER2 DISP/EXEC RETIRE
46 ;; \ | / | |
47 ;; INSTRUCTION POOL __________|_______/
48 ;; (inc. reorder buffer)
49 ;;
50 ;; Since the P6 CPUs execute instructions out-of-order, the most important
51 ;; consideration in performance tuning is making sure enough micro-ops are
52 ;; ready for execution in the out-of-order core, while not stalling the
53 ;; decoder.
54 ;;
55 ;; TODO:
56 ;; - Find a less crude way to model complex instructions, in
57 ;; particular how many cycles they take to be decoded.
58 ;; - Include decoder latencies in the total reservation latencies.
59 ;; This isn't necessary right now because we assume for every
60 ;; instruction that it never blocks a decoder.
61 ;; - Figure out where the p0 and p1 reservations come from. These
62 ;; appear not to be in the manual.
63 ;; - Lots more because I'm sure this is still far from optimal :-)
64
;; Automata used by this description.  The ppro_idiv and ppro_fdiv
;; automata exist to model the issue latencies of idiv and fdiv type
;; insns.
(define_automaton
  "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")
68
69 ;; Simple instructions of the register-register form have only one uop.
70 ;; Load instructions are also only one uop. Store instructions decode to
71 ;; two uops, and simple read-modify instructions also take two uops.
72 ;; Simple instructions of the register-memory form have two to three uops.
73 ;; Simple read-modify-write instructions have four uops. The rules for
74 ;; the decoder are simple:
75 ;; - an instruction with 1 uop can be decoded by any of the three
76 ;; decoders in one cycle.
77 ;; - an instruction with 2 to 4 uops can be decoded only by decoder 0
78 ;; but still in only one cycle.
79 ;; - a complex (microcode) instruction can also only be decoded by
80 ;; decoder 0, and this takes an unspecified number of cycles.
81 ;;
82 ;; The goal is to schedule such that we have a few-one-one uops sequence
83 ;; in each cycle, to decode as many instructions per cycle as possible.
;; The three instruction decoders, all belonging to the ppro_decoder
;; automaton.
(define_cpu_unit "decoder0,decoder1,decoder2" "ppro_decoder")
87
;; An instruction for decoder0 is looked for first: decoder1 and
;; decoder2 may not be reserved until decoder0 is reserved.
(presence_set "decoder1" "decoder0")
(presence_set "decoder2" "decoder0")
93
;; "decodern" stands for any one of the three decoders; most
;; instructions can be decoded on whichever of them is available.
(define_reservation "decodern" "(decoder0|decoder1|decoder2)")
96
97 ;; The out-of-order core has five pipelines. During each cycle, the core
98 ;; may dispatch zero or one uop on the port of any of the five pipelines
99 ;; so the maximum number of dispatched uops per cycle is 5. In practice,
100 ;; 3 uops per cycle is more realistic.
101 ;;
102 ;; Two of the five pipelines contain several execution units:
103 ;;
104 ;; Port 0 Port 1 Port 2 Port 3 Port 4
105 ;; ALU ALU LOAD SAC SDA
106 ;; FPU JUE
107 ;; AGU MMX
108 ;; MMX P3FPU
109 ;; P3FPU
110 ;;
111 ;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
112 ;; JUE = Jump Execution Unit, AGU = Address Generation Unit)
113 ;;
;; Dispatch ports of the out-of-order core.  p0 and p1 belong to the
;; ppro_core automaton, p2 to ppro_load, and p3/p4 to ppro_store.
(define_cpu_unit "p0,p1" "ppro_core")
(define_cpu_unit "p2" "ppro_load")
(define_cpu_unit "p3,p4" "ppro_store")

;; Divider units, each tracked by an automaton of its own.
(define_cpu_unit "idiv" "ppro_idiv")
(define_cpu_unit "fdiv" "ppro_fdiv")
119
120 ;; Only the irregular instructions have to be modeled here. A load
121 ;; increases the latency by 2 or 3, or by nothing if the manual gives
122 ;; a latency already. Store latencies are not accounted for.
123 ;;
124 ;; The simple instructions follow a very regular pattern of 1 uop per
125 ;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
126 ;; on port 4 and port 3. These instructions are modelled at the bottom
127 ;; of this file.
128 ;;
129 ;; For microcoded instructions we don't know how many uops are produced.
130 ;; These instructions are the "complex" ones in the Intel manuals. All
131 ;; we _do_ know is that they typically produce four or more uops, so
132 ;; they can only be decoded on decoder0. Modelling their latencies
133 ;; doesn't make sense because we don't know how these instructions are
134 ;; executed in the core. So we just model that they can only be decoded
135 ;; on decoder 0, and say that it takes a little while before the result
136 ;; is available.
;; Microcoded ("complex") instructions.  Only decoder0 is reserved; no
;; execution port is modelled, and the result is assumed to be
;; available after 6 cycles.
(define_insn_reservation "ppro_complex_insn" 6
  (and (eq_attr "cpu" "pentiumpro")
       (eq_attr "type" "other,multi,call,callv,str"))
  "decoder0")
141
;; Integer move between registers: any decoder, then port 0 or port 1,
;; latency 1.  (imov with memory operands does not use the integer
;; units, which is why only the memory "none" form reserves p0|p1.)
(define_insn_reservation "ppro_imov" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "imov")))
  "decodern,(p0|p1)")
148
;; Integer move from memory: any decoder, then the load port p2;
;; latency 4.
(define_insn_reservation "ppro_imov_load" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "imov")))
  "decodern,p2")
154
;; Integer move to memory: decoder0 only, then both store ports
;; (p4 and p3) in the same cycle; latency 1.
(define_insn_reservation "ppro_imov_store" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (eq_attr "type" "imov")))
  "decoder0,p4+p3")
160
;; imovx always decodes to one uop.  The register form issues on port 0
;; or port 1 with latency 1; with memory operands imovx does not use
;; the integer units.
(define_insn_reservation "ppro_imovx" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "imovx")))
  "decodern,(p0|p1)")
168
;; imovx from memory: any decoder, then the load port p2; latency 4.
(define_insn_reservation "ppro_imovx_load" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "imovx")))
  "decodern,p2")
174
;; lea executes on port 0 with latency 1 and throughput 1; it can be
;; decoded on any decoder.
(define_insn_reservation "ppro_lea" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "lea")))
  "decodern,p0")
181
;; Shifts and rotates execute on port 0 with latency and throughput 1.
;; This is the register-only form; when memory operands are involved
;; the load and store units have to be reserved as well.
(define_insn_reservation "ppro_shift_rotate" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
  "decodern,p0")
190
;; Shift/rotate with any kind of memory operand: decoder0, then p2
;; together with p0, then both store ports (p4+p3); latency 4.
(define_insn_reservation "ppro_shift_rotate_mem" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "!none")
            (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
  "decoder0,p2+p0,p4+p3")
196
197
198 ;; The P6 has a sophisticated branch prediction mechanism to minimize
199 ;; latencies due to branching. In particular, it has a fast way to
200 ;; execute branches that are taken multiple times (such as in loops).
201 ;; Branches not taken suffer no penalty, and correctly predicted
202 ;; branches cost only one fetch cycle. Mispredicted branches are very
203 ;; costly: typically 15 cycles and possibly as many as 26 cycles.
204 ;;
205 ;; Unfortunately all this makes it quite difficult to properly model
206 ;; the latencies for the compiler. Here I've made the choice to be
207 ;; optimistic and assume branches are often predicted correctly, so
208 ;; they have latency 1, and the decoders are not blocked.
209 ;;
210 ;; In addition, the model assumes a branch always decodes to only 1 uop,
211 ;; which is not exactly true because there are a few instructions that
212 ;; decode to 2 uops or microcode. But this probably gives the best
213 ;; results because we can assume these instructions can decode on all
214 ;; decoders.
;; Branch without a memory operand, modelled optimistically: any
;; decoder, then port 1, latency 1.
(define_insn_reservation "ppro_branch" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "ibr")))
  "decodern,p1")
220
;; Branch with a memory operand: decoder0, then the load port p2 and
;; port 1 in the same cycle; latency 6.
;; ??? Indirect branches probably have worse latency than this.
(define_insn_reservation "ppro_indirect_branch" 6
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "!none")
            (eq_attr "type" "ibr")))
  "decoder0,p2+p1")
227
;; leave: decoder0, then p2 together with p0 or p1, then p0 or p1 once
;; more; latency 4.
(define_insn_reservation "ppro_leave" 4
  (and (eq_attr "cpu" "pentiumpro")
       (eq_attr "type" "leave"))
  "decoder0,p2+(p0|p1),(p0|p1)")
232
;; imul has throughput one but latency 4, and it can only execute on
;; port 0.  Register form: any decoder.
(define_insn_reservation "ppro_imul" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "imul")))
  "decodern,p0")
239
;; imul with a memory operand: decoder0, then p2 and p0 in the same
;; cycle; latency 4.
(define_insn_reservation "ppro_imul_mem" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "!none")
            (eq_attr "type" "imul")))
  "decoder0,p2+p0")
245
246 ;; div and idiv are very similar, so we model them the same.
247 ;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
248 ;; These issue latencies are modelled via the ppro_idiv automaton.
;; QImode div/idiv on registers, latency 19.  The idiv unit is held for
;; 12 cycles in total (2 + 1 + 9), the first three together with an
;; integer port.
(define_insn_reservation "ppro_idiv_QI" 19
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "QI")
                 (eq_attr "type" "idiv"))))
  "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")
255
;; QImode div/idiv with a loaded operand, latency 19.  The first
;; execution cycle also reserves the load port p2; idiv is held for
;; 12 cycles in total (1 + 1 + 1 + 9).
(define_insn_reservation "ppro_idiv_QI_load" 19
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "QI")
                 (eq_attr "type" "idiv"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")
262
;; HImode div/idiv on registers, latency 23.  The idiv unit is held for
;; 21 cycles in total (3 + 1 + 17).
(define_insn_reservation "ppro_idiv_HI" 23
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "HI")
                 (eq_attr "type" "idiv"))))
  "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")
269
;; HImode div/idiv with a loaded operand, latency 23.  The first
;; execution cycle also reserves p2; idiv is held for 21 cycles in
;; total (1 + 1 + 1 + 18).
(define_insn_reservation "ppro_idiv_HI_load" 23
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "HI")
                 (eq_attr "type" "idiv"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")
276
;; SImode div/idiv on registers, latency 39.  The idiv unit is held for
;; 37 cycles in total (3 + 1 + 33).
(define_insn_reservation "ppro_idiv_SI" 39
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SI")
                 (eq_attr "type" "idiv"))))
  "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")
283
;; SImode div/idiv with a loaded operand, latency 39.  The first
;; execution cycle also reserves p2; idiv is held for 37 cycles in
;; total (1 + 1 + 1 + 34).
(define_insn_reservation "ppro_idiv_SI_load" 39
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SI")
                 (eq_attr "type" "idiv"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")
290
291 ;; Floating point operations always execute on port 0.
292 ;; ??? where do these latencies come from? fadd has latency 3 and
293 ;; has throughput "1/cycle (align with FADD)". What do they
294 ;; mean and how can we model that?
;; fop without a memory access (memory "none" or "unknown"): any
;; decoder, then port 0; latency 3.
(define_insn_reservation "ppro_fop" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none,unknown")
            (eq_attr "type" "fop")))
  "decodern,p0")
300
;; fop with a loaded operand: decoder0, then p2 together with p0, then
;; p0 for one more cycle; latency 5.
(define_insn_reservation "ppro_fop_load" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "fop")))
  "decoder0,p2+p0,p0")
306
;; fop storing its result: decoder0, then p0 for three cycles, the last
;; of them together with both store ports (p4+p3); latency 3.
(define_insn_reservation "ppro_fop_store" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (eq_attr "type" "fop")))
  "decoder0,p0,p0,p0+p4+p3")
312
;; fop that both loads and stores: decoder0, then p2 with p0, then p0
;; with both store ports (p4+p3); latency 5.
(define_insn_reservation "ppro_fop_both" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "both")
            (eq_attr "type" "fop")))
  "decoder0,p2+p0,p0+p4+p3")
318
;; fsgn: any decoder, then port 0; latency 1.  No memory condition is
;; tested.
(define_insn_reservation "ppro_fsgn" 1
  (and (eq_attr "cpu" "pentiumpro")
       (eq_attr "type" "fsgn"))
  "decodern,p0")
323
;; fistp: decoder0, then p0 for two cycles, then both store ports
;; (p4+p3); latency 5.
(define_insn_reservation "ppro_fistp" 5
  (and (eq_attr "cpu" "pentiumpro")
       (eq_attr "type" "fistp"))
  "decoder0,p0*2,p4+p3")
328
;; fcmov: decoder0, then p0 for two cycles; latency 2.
(define_insn_reservation "ppro_fcmov" 2
  (and (eq_attr "cpu" "pentiumpro")
       (eq_attr "type" "fcmov"))
  "decoder0,p0*2")
333
;; fcmp on registers: any decoder, then port 0; latency 1.
(define_insn_reservation "ppro_fcmp" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "fcmp")))
  "decodern,p0")
339
;; fcmp with a loaded operand: decoder0, then p2 and p0 in the same
;; cycle; latency 4.
(define_insn_reservation "ppro_fcmp_load" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "fcmp")))
  "decoder0,p2+p0")
345
;; fmov between registers: any decoder, then port 0; latency 1.
(define_insn_reservation "ppro_fmov" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "fmov")))
  "decodern,p0")
351
;; fmov load in any mode other than XF: any decoder, then the load port
;; p2; latency 1.
(define_insn_reservation "ppro_fmov_load" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "!XF")
                 (eq_attr "type" "fmov"))))
  "decodern,p2")
358
;; XFmode fmov load: decoder0, then p2 and p0 together for two
;; consecutive cycles; latency 3.
(define_insn_reservation "ppro_fmov_XF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fmov"))))
  "decoder0,(p2+p0)*2")
365
;; fmov store in any mode other than XF: any decoder, then port 0;
;; latency 1.  Note that no store port is reserved by this entry.
(define_insn_reservation "ppro_fmov_store" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "!XF")
                 (eq_attr "type" "fmov"))))
  "decodern,p0")
372
;; XFmode fmov store: decoder0, then p0 with p4, then p0 with p3;
;; latency 3.
(define_insn_reservation "ppro_fmov_XF_store" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fmov"))))
  "decoder0,(p0+p4),(p0+p3)")
379
;; fmul executes on port 0 with latency 5 and has issue latency 2.
;; The reservation decodes on decoder0 and then holds p0 for two
;; consecutive cycles.
;; NOTE(review): this comment used to say the issue latency is not
;; modelled, yet "p0*2" does keep port 0 busy for two cycles -- confirm
;; which of the two is intended.
(define_insn_reservation "ppro_fmul" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "fmul")))
  "decoder0,p0*2")
387
;; fmul with a loaded operand: decoder0, then p2 together with p0, then
;; p0 for one more cycle; latency 6.
(define_insn_reservation "ppro_fmul_load" 6
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "fmul")))
  "decoder0,p2+p0,p0")
393
394 ;; fdiv latencies depend on the mode of the operands. XFmode gives
395 ;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
396 ;; Division by a power of 2 takes only 9 cycles, but we cannot model
397 ;; that. Throughput is equal to latency - 1, which we model using the
398 ;; ppro_fdiv automaton.
;; SFmode fdiv/fpspc on registers, latency 18.  Any decoder; p0 is used
;; for one cycle while the fdiv unit is held for 17 cycles (1 + 16).
(define_insn_reservation "ppro_fdiv_SF" 18
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decodern,p0+fdiv,fdiv*16")
405
;; SFmode fdiv/fpspc with a loaded operand, latency 19.  decoder0; the
;; first execution cycle reserves p2 and p0, and the fdiv unit is held
;; for 17 cycles (1 + 16).
(define_insn_reservation "ppro_fdiv_SF_load" 19
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decoder0,p2+p0+fdiv,fdiv*16")
412
;; DFmode fdiv/fpspc on registers, latency 32.  Any decoder; p0 is used
;; for one cycle while the fdiv unit is held for 31 cycles (1 + 30).
(define_insn_reservation "ppro_fdiv_DF" 32
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "DF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decodern,p0+fdiv,fdiv*30")
419
;; DFmode fdiv/fpspc with a loaded operand, latency 33.  decoder0; the
;; first execution cycle reserves p2 and p0, and the fdiv unit is held
;; for 31 cycles (1 + 30).
(define_insn_reservation "ppro_fdiv_DF_load" 33
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "DF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decoder0,p2+p0+fdiv,fdiv*30")
426
;; XFmode fdiv/fpspc on registers, latency 38.  Any decoder; p0 is used
;; for one cycle while the fdiv unit is held for 37 cycles (1 + 36).
(define_insn_reservation "ppro_fdiv_XF" 38
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decodern,p0+fdiv,fdiv*36")
433
;; XFmode fdiv/fpspc with a loaded operand, latency 39.  decoder0; the
;; first execution cycle reserves p2 and p0, and the fdiv unit is held
;; for 37 cycles (1 + 36).
(define_insn_reservation "ppro_fdiv_XF_load" 39
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decoder0,p2+p0+fdiv,fdiv*36")
440
441 ;; MMX instructions can execute on either port 0 or port 1 with a
442 ;; throughput of 1/cycle.
443 ;; on port 0: - ALU (latency 1)
444 ;; - Multiplier Unit (latency 3)
445 ;; on port 1: - ALU (latency 1)
446 ;; - Shift Unit (latency 1)
447 ;;
448 ;; MMX instructions are either of the type reg-reg, or read-modify, and
449 ;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
450 ;; so they behave as "simple" instructions that need no special modelling.
451 ;; We only have to model mmxshft and mmxmul.
;; MMX shift on registers: any decoder, then port 1 (where the shift
;; unit lives); latency 1.
(define_insn_reservation "ppro_mmx_shft" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "mmxshft")))
  "decodern,p1")
457
;; MMX shift with a loaded operand: decoder0, then the load port p2 and
;; port 1 in the same cycle; latency 2.
;; The condition has to select memory "load".  It previously tested
;; "none", which merely repeated the condition of ppro_mmx_shft, so the
;; load form was never described by any reservation in this file.
(define_insn_reservation "ppro_mmx_shft_load" 2
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxshft")))
  "decoder0,p2+p1")
463
;; MMX multiply on registers: any decoder, then port 0 (where the
;; multiplier unit lives); latency 3.
(define_insn_reservation "ppro_mmx_mul" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "mmxmul")))
  "decodern,p0")
469
;; MMX multiply with a loaded operand: decoder0, then the load port p2
;; and port 0 in the same cycle; latency 3.
;; The condition has to select memory "load".  It previously tested
;; "none", which merely repeated the condition of ppro_mmx_mul, so the
;; load form was never described by any reservation in this file.
(define_insn_reservation "ppro_mmx_mul_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxmul")))
  "decoder0,p2+p0")
475
;; DImode mmxcvt: any decoder, then port 1; latency 4.  No memory
;; condition is tested.
(define_insn_reservation "ppro_sse_mmxcvt" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "mode" "DI")
            (eq_attr "type" "mmxcvt")))
  "decodern,p1")
481
482 ;; FIXME: These are Pentium III only, but we cannot tell here if
483 ;; we're generating code for PentiumPro/Pentium II or Pentium III
484 ;; (define_insn_reservation "ppro_sse_mmxshft" 2
485 ;; (and (eq_attr "cpu" "pentiumpro")
486 ;; (and (eq_attr "mode" "DI")
487 ;; (eq_attr "type" "mmxshft")))
488 ;; "decodern,p0")
489
490 ;; SSE is very complicated, and takes a bit more effort.
491 ;; ??? I assumed that all SSE instructions decode on decoder0,
492 ;; but is this correct?
493
;; The sfence instruction, recognised as type "sse" with memory
;; "unknown": decoder0, then both store ports (p4+p3); latency 3.
(define_insn_reservation "ppro_sse_sfence" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "unknown")
            (eq_attr "type" "sse")))
  "decoder0,p4+p3")
500
;; Generic SFmode "sse" insn: any decoder, then port 0; latency 3.
;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
(define_insn_reservation "ppro_sse_SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "mode" "SF")
            (eq_attr "type" "sse")))
  "decodern,p0")
507
;; SFmode sseadd on registers: any decoder, then port 1; latency 3.
(define_insn_reservation "ppro_sse_add_SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "sseadd"))))
  "decodern,p1")
514
;; SFmode sseadd with a loaded operand: decoder0, then p2 and p1 in the
;; same cycle; latency 3.
(define_insn_reservation "ppro_sse_add_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "sseadd"))))
  "decoder0,p2+p1")
521
;; SFmode ssecmp on registers: decoder0, then port 1; latency 3.
(define_insn_reservation "ppro_sse_cmp_SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,p1")
528
;; SFmode ssecmp with a loaded operand: decoder0, then p2 and p1 in the
;; same cycle; latency 3.
(define_insn_reservation "ppro_sse_cmp_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,p2+p1")
535
;; SFmode ssecomi on registers: any decoder, then port 0; latency 1.
(define_insn_reservation "ppro_sse_comi_SF" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecomi"))))
  "decodern,p0")
542
;; SFmode ssecomi with a loaded operand: decoder0, then p2 and p0 in
;; the same cycle; latency 1.
(define_insn_reservation "ppro_sse_comi_SF_load" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecomi"))))
  "decoder0,p2+p0")
549
;; SFmode ssemul on registers: any decoder, then port 0; latency 4.
(define_insn_reservation "ppro_sse_mul_SF" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemul"))))
  "decodern,p0")
556
;; SFmode ssemul with a loaded operand: decoder0, then p2 and p0 in the
;; same cycle; latency 4.
(define_insn_reservation "ppro_sse_mul_SF_load" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemul"))))
  "decoder0,p2+p0")
563
;; SFmode ssediv on registers: decoder0, then port 0 held for 17
;; cycles; latency 18.
;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
(define_insn_reservation "ppro_sse_div_SF" 18
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,p0*17")
571
;; SFmode ssediv with a loaded operand: decoder0, then p2 together with
;; p0 for one cycle, then p0 alone for 16 more cycles; latency 18.
;; The condition has to select memory "load".  It previously tested
;; "none", which merely repeated the condition of ppro_sse_div_SF, so
;; the load form was never described by any reservation in this file.
(define_insn_reservation "ppro_sse_div_SF_load" 18
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,(p2+p0),p0*16")
578
;; SFmode sseicvt: decoder0, then p2 and p1 together for two
;; consecutive cycles; latency 4.  No memory condition is tested.
(define_insn_reservation "ppro_sse_icvt_SF" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "mode" "SF")
            (eq_attr "type" "sseicvt")))
  "decoder0,(p2+p1)*2")
584
;; SImode sseicvt: decoder0, then p2 and p1 together for one cycle;
;; latency 3.  No memory condition is tested.
(define_insn_reservation "ppro_sse_icvt_SI" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "mode" "SI")
            (eq_attr "type" "sseicvt")))
  "decoder0,(p2+p1)")
590
;; SFmode ssemov between registers: decoder0, then port 0 or port 1;
;; latency 3.
(define_insn_reservation "ppro_sse_mov_SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,(p0|p1)")
597
;; SFmode ssemov load: decoder0, then p2 together with p0 or p1;
;; latency 3.
(define_insn_reservation "ppro_sse_mov_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p2+(p0|p1)")
604
;; SFmode ssemov store: decoder0, then both store ports (p4+p3);
;; latency 3.
(define_insn_reservation "ppro_sse_mov_SF_store" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p4+p3")
611
;; Generic V4SFmode "sse" insn: decoder0, then port 1 for two cycles;
;; latency 4.  No memory condition is tested.
(define_insn_reservation "ppro_sse_V4SF" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "mode" "V4SF")
            (eq_attr "type" "sse")))
  "decoder0,p1*2")
617
;; V4SFmode sseadd on registers: decoder0, then port 1 for two cycles;
;; latency 3.
(define_insn_reservation "ppro_sse_add_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sseadd"))))
  "decoder0,p1*2")
624
;; V4SFmode sseadd with a loaded operand: decoder0, then p2 and p1
;; together for two consecutive cycles; latency 3.
(define_insn_reservation "ppro_sse_add_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sseadd"))))
  "decoder0,(p2+p1)*2")
631
;; V4SFmode ssecmp on registers: decoder0, then port 1 for two cycles;
;; latency 3.
(define_insn_reservation "ppro_sse_cmp_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,p1*2")
638
;; V4SFmode ssecmp with a loaded operand: decoder0, then p2 and p1
;; together for two consecutive cycles; latency 3.
(define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,(p2+p1)*2")
645
;; V4SFmode ssecvt without a memory access (memory "none" or
;; "unknown"): decoder0, then port 1 for two cycles; latency 3.
(define_insn_reservation "ppro_sse_cvt_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none,unknown")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecvt"))))
  "decoder0,p1*2")
652
;; V4SFmode ssecvt with any memory access (memory neither "none" nor
;; "unknown"): decoder0, then port 1, then both store ports (p4+p3);
;; latency 4.
;; This is the counterpart of ppro_sse_cvt_V4SF, so it has to test type
;; "ssecvt".  It previously tested "ssecmp", which left V4SF ssecvt
;; insns with memory operands without any reservation in this file.
(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "!none,unknown")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecvt"))))
  "decoder0,p1,p4+p3")
659
;; V4SFmode ssemul on registers: decoder0, then port 0 for two cycles;
;; latency 5.
(define_insn_reservation "ppro_sse_mul_V4SF" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemul"))))
  "decoder0,p0*2")
666
;; V4SFmode ssemul with a loaded operand: decoder0, then p2 and p0
;; together for two consecutive cycles; latency 5.
(define_insn_reservation "ppro_sse_mul_V4SF_load" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemul"))))
  "decoder0,(p2+p0)*2")
673
;; V4SFmode ssediv on registers: decoder0, then port 0 held for 34
;; cycles; latency 48.
;; FIXME: p0 really closed this long???
(define_insn_reservation "ppro_sse_div_V4SF" 48
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,p0*34")
681
;; V4SFmode ssediv with a loaded operand: decoder0, then p2 and p0
;; together for two cycles, then p0 alone for 32 more; latency 48.
(define_insn_reservation "ppro_sse_div_V4SF_load" 48
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,(p2+p0)*2,p0*32")
688
;; V4SFmode sselog/sselog1 on registers: any decoder, then port 1;
;; latency 2.
(define_insn_reservation "ppro_sse_log_V4SF" 2
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sselog,sselog1"))))
  "decodern,p1")
695
;; V4SFmode sselog/sselog1 with a loaded operand: decoder0, then p2 and
;; p1 in the same cycle; latency 2.
(define_insn_reservation "ppro_sse_log_V4SF_load" 2
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sselog,sselog1"))))
  "decoder0,(p2+p1)")
702
;; V4SFmode ssemov between registers: decoder0, then port 0 or port 1
;; in each of two consecutive cycles; latency 1.
(define_insn_reservation "ppro_sse_mov_V4SF" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,(p0|p1)*2")
709
;; V4SFmode ssemov load: decoder0, then the load port p2 for two
;; cycles; latency 2.
(define_insn_reservation "ppro_sse_mov_V4SF_load" 2
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p2*2")
716
;; V4SFmode ssemov store: decoder0, then both store ports (p4+p3) for
;; two consecutive cycles; latency 3.
(define_insn_reservation "ppro_sse_mov_V4SF_store" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,(p4+p3)*2")
723
724 ;; All other instructions are modelled as simple instructions.
725 ;; We have already modelled all i387 floating point instructions, so all
726 ;; other instructions execute on either port 0 or port 1. This includes
727 ;; the ALU units, and the MMX units.
728 ;;
729 ;; reg-reg instructions produce 1 uop so they can be decoded on any of
730 ;; the three decoders.
;; Simple instructions without a memory access (memory "none" or
;; "unknown"): any decoder, then port 0 or port 1; latency 1.
(define_insn_reservation "ppro_insn" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none,unknown")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decodern,(p0|p1)")
736
;; Read-modify and register-memory instructions have two or three uops,
;; so they have to be decoded on decoder0.  After decoding, p2 is
;; reserved together with p0 or p1; latency 3.
(define_insn_reservation "ppro_insn_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,p2+(p0|p1)")
744
;; Simple instructions that store their result: decoder0, then port 0
;; or port 1, then both store ports (p4+p3); latency 1.
(define_insn_reservation "ppro_insn_store" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,(p0|p1),p4+p3")
750
;; Read-modify-store instructions produce 4 uops, so they have to be
;; decoded on decoder0 as well.  After decoding, p2 is reserved
;; together with p0 or p1, followed by both store ports (p4+p3);
;; latency 4.
(define_insn_reservation "ppro_insn_both" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "both")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,p2+(p0|p1),p4+p3")
758