comparison gcc/config/arm/neon-schedgen.ml @ 0:a06113de4d67

first commit
author kent <kent@cr.ie.u-ryukyu.ac.jp>
date Fri, 17 Jul 2009 14:47:48 +0900
parents
children 3bfb6c00c1e0
comparison
equal deleted inserted replaced
-1:000000000000 0:a06113de4d67
1 (* Emission of the core of the Cortex-A8 NEON scheduling description.
2 Copyright (C) 2007 Free Software Foundation, Inc.
3 Contributed by CodeSourcery.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>.
20 *)
21
22 (* This scheduling description generator works as follows.
23 - Each group of instructions has source and destination requirements
24 specified. The source requirements may be specified using
25 Source (the stage at which all source operands not otherwise
26 described are read), Source_m (the stage at which Rm operands are
27 read), Source_n (likewise for Rn) and Source_d (likewise for Rd).
28 - For each group of instructions the earliest stage where a source
29 operand may be required is calculated.
30 - Each group of instructions is selected in turn as a producer.
31 The latencies between this group and every other group are then
32 calculated, yielding up to four values for each combination:
33 1. Producer -> consumer Rn latency
34 2. Producer -> consumer Rm latency
35 3. Producer -> consumer Rd (as a source) latency
36 4. Producer -> consumer worst-case latency.
37 Value 4 is calculated from the destination availability requirements
38 of the consumer and the earliest source availability requirements
39 of the producer.
40 - The largest Value 4 calculated for the current producer is the
41 worse-case latency, L, for that instruction group. This value is written
42 out in a define_insn_reservation for the producer group.
43 - For each producer and consumer pair, the latencies calculated above
44 are collated. The average (of up to four values) is calculated and
45 if this average is different from the worst-case latency, an
46 unguarded define_bypass construction is issued for that pair.
47 (For each pair only one define_bypass construction will be emitted,
48 and at present we do not emit specific guards.)
49 *)
50
51 open Utils
52
53 let n1 = 1 and n2 = 2 and n3 = 3 and n4 = 4 and n5 = 5 and n6 = 6
54 and n7 = 7 and n8 = 8 and n9 = 9
55
56 type availability = Source of int
57 | Source_n of int
58 | Source_m of int
59 | Source_d of int
60 | Dest of int
61 | Dest_n_after of int * int
62
63 type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d
64
65 (* Reservation behaviors. All but the last row here correspond to one
66 pipeline each. Each constructor will correspond to one
67 define_reservation. *)
68 type reservation =
69 Mul | Mul_2cycle | Mul_4cycle
70 | Shift | Shift_2cycle
71 | ALU | ALU_2cycle
72 | Fmul | Fmul_2cycle
73 | Fadd | Fadd_2cycle
74 (* | VFP *)
75 | Permute of int
76 | Ls of int
77 | Fmul_then_fadd | Fmul_then_fadd_2
78
79 (* This table must be kept as short as possible by conflating
80 entries with the same availability behavior.
81
82 First components: instruction group names
83 Second components: availability requirements, in the order in which
84 they should appear in the comments in the .md file.
85 Third components: reservation info
86 *)
87 let availability_table = [
88 (* NEON integer ALU instructions. *)
89 (* vbit vbif vbsl vorr vbic vnot vcls vclz vcnt vadd vand vorr
90 veor vbic vorn ddd qqq *)
91 "neon_int_1", [Source n2; Dest n3], ALU;
92 (* vadd vsub qqd vsub ddd qqq *)
93 "neon_int_2", [Source_m n1; Source_n n2; Dest n3], ALU;
94 (* vsum vneg dd qq vadd vsub qdd *)
95 "neon_int_3", [Source n1; Dest n3], ALU;
96 (* vabs vceqz vcgez vcbtz vclez vcltz vadh vradh vsbh vrsbh dqq *)
97 (* vhadd vrhadd vqadd vtst ddd qqq *)
98 "neon_int_4", [Source n2; Dest n4], ALU;
99 (* vabd qdd vhsub vqsub vabd vceq vcge vcgt vmax vmin vfmx vfmn ddd ddd *)
100 "neon_int_5", [Source_m n1; Source_n n2; Dest n4], ALU;
101 (* vqneg vqabs dd qq *)
102 "neon_vqneg_vqabs", [Source n1; Dest n4], ALU;
103 (* vmov vmvn *)
104 "neon_vmov", [Dest n3], ALU;
105 (* vaba *)
106 "neon_vaba", [Source_n n2; Source_m n1; Source_d n3; Dest n6], ALU;
107 "neon_vaba_qqq",
108 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], ALU_2cycle;
109 (* vsma *)
110 "neon_vsma", [Source_m n1; Source_d n3; Dest n6], ALU;
111
112 (* NEON integer multiply instructions. *)
113 (* vmul, vqdmlh, vqrdmlh *)
114 (* vmul, vqdmul, qdd 16/8 long 32/16 long *)
115 "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long", [Source n2; Dest n6], Mul;
116 "neon_mul_qqq_8_16_32_ddd_32", [Source n2; Dest_n_after (1, n6)], Mul_2cycle;
117 (* vmul, vqdmul again *)
118 "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar",
119 [Source_n n2; Source_m n1; Dest_n_after (1, n6)], Mul_2cycle;
120 (* vmla, vmls *)
121 "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long",
122 [Source_n n2; Source_m n2; Source_d n3; Dest n6], Mul;
123 "neon_mla_qqq_8_16",
124 [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
125 "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long",
126 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
127 "neon_mla_qqq_32_qqd_32_scalar",
128 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (3, n6)], Mul_4cycle;
129 (* vmul, vqdmulh, vqrdmulh *)
130 (* vmul, vqdmul *)
131 "neon_mul_ddd_16_scalar_32_16_long_scalar",
132 [Source_n n2; Source_m n1; Dest n6], Mul;
133 "neon_mul_qqd_32_scalar",
134 [Source_n n2; Source_m n1; Dest_n_after (3, n6)], Mul_4cycle;
135 (* vmla, vmls *)
136 (* vmla, vmla, vqdmla, vqdmls *)
137 "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar",
138 [Source_n n2; Source_m n1; Source_d n3; Dest n6], Mul;
139
140 (* NEON integer shift instructions. *)
141 (* vshr/vshl immediate, vshr_narrow, vshl_vmvh, vsli_vsri_ddd *)
142 "neon_shift_1", [Source n1; Dest n3], Shift;
143 (* vqshl, vrshr immediate; vqshr, vqmov, vrshr, vqrshr narrow;
144 vqshl_vrshl_vqrshl_ddd *)
145 "neon_shift_2", [Source n1; Dest n4], Shift;
146 (* vsli, vsri and vshl for qqq *)
147 "neon_shift_3", [Source n1; Dest_n_after (1, n3)], Shift_2cycle;
148 "neon_vshl_ddd", [Source n1; Dest n1], Shift;
149 "neon_vqshl_vrshl_vqrshl_qqq", [Source n1; Dest_n_after (1, n4)],
150 Shift_2cycle;
151 "neon_vsra_vrsra", [Source_m n1; Source_d n3; Dest n6], Shift;
152
153 (* NEON floating-point instructions. *)
154 (* vadd, vsub, vabd, vmul, vceq, vcge, vcgt, vcage, vcagt, vmax, vmin *)
155 (* vabs, vneg, vceqz, vcgez, vcgtz, vclez, vcltz, vrecpe, vrsqrte, vcvt *)
156 "neon_fp_vadd_ddd_vabs_dd", [Source n2; Dest n5], Fadd;
157 "neon_fp_vadd_qqq_vabs_qq", [Source n2; Dest_n_after (1, n5)],
158 Fadd_2cycle;
159 (* vsum, fvmx, vfmn *)
160 "neon_fp_vsum", [Source n1; Dest n5], Fadd;
161 "neon_fp_vmul_ddd", [Source_n n2; Source_m n1; Dest n5], Fmul;
162 "neon_fp_vmul_qqd", [Source_n n2; Source_m n1; Dest_n_after (1, n5)],
163 Fmul_2cycle;
164 (* vmla, vmls *)
165 "neon_fp_vmla_ddd",
166 [Source_n n2; Source_m n2; Source_d n3; Dest n9], Fmul_then_fadd;
167 "neon_fp_vmla_qqq",
168 [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n9)],
169 Fmul_then_fadd_2;
170 "neon_fp_vmla_ddd_scalar",
171 [Source_n n2; Source_m n1; Source_d n3; Dest n9], Fmul_then_fadd;
172 "neon_fp_vmla_qqq_scalar",
173 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n9)],
174 Fmul_then_fadd_2;
175 "neon_fp_vrecps_vrsqrts_ddd", [Source n2; Dest n9], Fmul_then_fadd;
176 "neon_fp_vrecps_vrsqrts_qqq", [Source n2; Dest_n_after (1, n9)],
177 Fmul_then_fadd_2;
178
179 (* NEON byte permute instructions. *)
180 (* vmov; vtrn and vswp for dd; vzip for dd; vuzp for dd; vrev; vext for dd *)
181 "neon_bp_simple", [Source n1; Dest n2], Permute 1;
182 (* vswp for qq; vext for qqq; vtbl with {Dn} or {Dn, Dn1};
183 similarly for vtbx *)
184 "neon_bp_2cycle", [Source n1; Dest_n_after (1, n2)], Permute 2;
185 (* all the rest *)
186 "neon_bp_3cycle", [Source n1; Dest_n_after (2, n2)], Permute 3;
187
188 (* NEON load/store instructions. *)
189 "neon_ldr", [Dest n1], Ls 1;
190 "neon_str", [Source n1], Ls 1;
191 "neon_vld1_1_2_regs", [Dest_n_after (1, n1)], Ls 2;
192 "neon_vld1_3_4_regs", [Dest_n_after (2, n1)], Ls 3;
193 "neon_vld2_2_regs_vld1_vld2_all_lanes", [Dest_n_after (1, n2)], Ls 2;
194 "neon_vld2_4_regs", [Dest_n_after (2, n2)], Ls 3;
195 "neon_vld3_vld4", [Dest_n_after (3, n2)], Ls 4;
196 "neon_vst1_1_2_regs_vst2_2_regs", [Source n1], Ls 2;
197 "neon_vst1_3_4_regs", [Source n1], Ls 3;
198 "neon_vst2_4_regs_vst3_vst4", [Source n1], Ls 4;
199 "neon_vst3_vst4", [Source n1], Ls 4;
200 "neon_vld1_vld2_lane", [Source n1; Dest_n_after (2, n2)], Ls 3;
201 "neon_vld3_vld4_lane", [Source n1; Dest_n_after (4, n2)], Ls 5;
202 "neon_vst1_vst2_lane", [Source n1], Ls 2;
203 "neon_vst3_vst4_lane", [Source n1], Ls 3;
204 "neon_vld3_vld4_all_lanes", [Dest_n_after (1, n2)], Ls 3;
205
206 (* NEON register transfer instructions. *)
207 "neon_mcr", [Dest n2], Permute 1;
208 "neon_mcr_2_mcrr", [Dest n2], Permute 2;
209 (* MRC instructions are in the .tpl file. *)
210 ]
211
212 (* Augment the tuples in the availability table with an extra component
213 that describes the earliest stage where a source operand may be
214 required. (It is also possible that an entry in the table has no
215 source requirements.) *)
216 let calculate_sources =
217 List.map (fun (name, avail, res) ->
218 let earliest_stage =
219 List.fold_left
220 (fun cur -> fun info ->
221 match info with
222 Source stage
223 | Source_n stage
224 | Source_m stage
225 | Source_d stage ->
226 (match cur with
227 None -> Some stage
228 | Some stage' when stage < stage' -> Some stage
229 | _ -> cur)
230 | _ -> cur) None avail
231 in
232 (name, avail, res, earliest_stage))
233
234 (* Find the stage, if any, at the end of which a group produces a result. *)
235 let find_dest (attr, avail, _, _) =
236 try
237 find_with_result
238 (fun av -> match av with
239 Dest st -> Some (Some st)
240 | Dest_n_after (after, st) -> Some (Some (after + st))
241 | _ -> None) avail
242 with Not_found -> None
243
244 (* Find the worst-case latency between a producer and a consumer. *)
245 let worst_case_latency producer (_, _, _, earliest_required) =
246 let dest = find_dest producer in
247 match earliest_required, dest with
248 None, _ ->
249 (* The consumer doesn't have any source requirements. *)
250 None
251 | _, None ->
252 (* The producer doesn't produce any results (e.g. a store insn). *)
253 None
254 | Some consumed, Some produced -> Some (produced - consumed + 1)
255
256 (* Helper function for below. *)
257 let latency_calc f producer (_, avail, _, _) =
258 try
259 let source_avail = find_with_result f avail in
260 match find_dest producer with
261 None ->
262 (* The producer does not produce a result. *)
263 Some 0
264 | Some produced ->
265 let latency = produced - source_avail + 1 in
266 (* Latencies below zero are raised to zero since we don't have
267 delay slots. *)
268 if latency < 0 then Some 0 else Some latency
269 with Not_found -> None
270
271 (* Find any Rm latency between a producer and a consumer. If no
272 Rm source requirement is explicitly specified for the consumer,
273 return "positive infinity". Also return "positive infinity" if
274 the latency matches the supplied worst-case latency for this
275 producer. *)
276 let get_m_latency producer consumer =
277 match latency_calc (fun av -> match av with Source_m stage -> Some stage
278 | _ -> None) producer consumer
279 with None -> [] | Some latency -> [(Guard_only_m, latency)]
280
281 (* Likewise for Rn. *)
282 let get_n_latency producer consumer =
283 match latency_calc (fun av -> match av with Source_n stage -> Some stage
284 | _ -> None) producer consumer
285 with None -> [] | Some latency -> [(Guard_only_n, latency)]
286
287 (* Likewise for Rd. *)
288 let get_d_latency producer consumer =
289 match
290 latency_calc (fun av -> match av with Source_d stage -> Some stage
291 | _ -> None) producer consumer
292 with None -> [] | Some latency -> [(Guard_only_d, latency)]
293
294 (* Given a producer and a consumer, work out the latency of the producer
295 to the consumer in each of the four cases (availability information
296 permitting) identified at the top of this file. Return the
297 consumer, the worst-case unguarded latency and any guarded latencies. *)
298 let calculate_latencies producer consumer =
299 let worst = worst_case_latency producer consumer in
300 let m_latency = get_m_latency producer consumer in
301 let n_latency = get_n_latency producer consumer in
302 let d_latency = get_d_latency producer consumer in
303 (consumer, worst, m_latency @ n_latency @ d_latency)
304
305 (* Helper function for below. *)
306 let pick_latency largest worst guards =
307 let guards =
308 match worst with
309 None -> guards
310 | Some worst -> (Guard_none, worst) :: guards
311 in
312 if List.length guards = 0 then None else
313 let total_latency =
314 List.fold_left (fun acc -> fun (_, latency) -> acc + latency) 0 guards
315 in
316 let average_latency = (float_of_int total_latency) /.
317 (float_of_int (List.length guards)) in
318 let rounded_latency = int_of_float (ceil average_latency) in
319 if rounded_latency = largest then None
320 else Some (Guard_none, rounded_latency)
321
322 (* Collate all bypasses for a particular producer as required in
323 worst_case_latencies_and_bypasses. (By this stage there is a maximum
324 of one bypass from this producer to any particular consumer listed
325 in LATENCIES.) Use a hash table to collate bypasses with the
326 same latency and guard. *)
327 let collate_bypasses (producer_name, _, _, _) largest latencies =
328 let ht = Hashtbl.create 42 in
329 let keys = ref [] in
330 List.iter (
331 fun ((consumer, _, _, _), worst, guards) ->
332 (* Find out which latency to use. Ignoring latencies that match
333 the *overall* worst-case latency for this producer (which will
334 be in define_insn_reservation), we have to examine:
335 1. the latency with no guard between this producer and this
336 consumer; and
337 2. any guarded latency. *)
338 let guard_latency_opt = pick_latency largest worst guards in
339 match guard_latency_opt with
340 None -> ()
341 | Some (guard, latency) ->
342 begin
343 (if (try ignore (Hashtbl.find ht (guard, latency)); false
344 with Not_found -> true) then
345 keys := (guard, latency) :: !keys);
346 Hashtbl.add ht (guard, latency) consumer
347 end
348 ) latencies;
349 (* The hash table now has bypasses collated so that ones with the
350 same latency and guard have the same keys. Walk through all the
351 keys, extract the associated bypasses, and concatenate the names
352 of the consumers for each bypass. *)
353 List.map (
354 fun ((guard, latency) as key) ->
355 let consumers = Hashtbl.find_all ht key in
356 (producer_name,
357 String.concat ",\\\n " consumers,
358 latency,
359 guard)
360 ) !keys
361
362 (* For every producer, find the worst-case latency between it and
363 *any* consumer. Also determine (if such a thing exists) the
364 lowest-latency bypass from each producer to each consumer. Group
365 the output in such a way that all bypasses with the same producer
366 and latency are together, and so that bypasses with the worst-case
367 latency are ignored. *)
368 let worst_case_latencies_and_bypasses =
369 let rec f (worst_acc, bypasses_acc) prev xs =
370 match xs with
371 [] -> (worst_acc, bypasses_acc)
372 | ((producer_name, producer_avail, res_string, _) as producer)::next ->
373 (* For this particular producer, work out the latencies between
374 it and every consumer. *)
375 let latencies =
376 List.fold_left (fun acc -> fun consumer ->
377 (calculate_latencies producer consumer) :: acc)
378 [] (prev @ xs)
379 in
380 (* Now work out what the overall worst case latency was for this
381 particular producer. *)
382 match latencies with
383 [] -> assert false
384 | _ ->
385 let comp_fn (_, l1, _) (_, l2, _) =
386 if l1 > l2 then -1 else if l1 = l2 then 0 else 1
387 in
388 let largest =
389 match List.hd (List.sort comp_fn latencies) with
390 (_, None, _) -> 0 (* Producer has no consumers. *)
391 | (_, Some worst, _) -> worst
392 in
393 (* Having got the largest latency, collect all bypasses for
394 this producer and filter out those with that larger
395 latency. Record the others for later emission. *)
396 let bypasses = collate_bypasses producer largest latencies in
397 (* Go on to process remaining producers, having noted
398 the result for this one. *)
399 f ((producer_name, producer_avail, largest,
400 res_string) :: worst_acc,
401 bypasses @ bypasses_acc)
402 (prev @ [producer]) next
403 in
404 f ([], []) []
405
406 (* Emit a helpful comment for a define_insn_reservation. *)
407 let write_comment producer avail =
408 let seen_source = ref false in
409 let describe info =
410 let read = if !seen_source then "" else "read " in
411 match info with
412 Source stage ->
413 seen_source := true;
414 Printf.printf "%stheir source operands at N%d" read stage
415 | Source_n stage ->
416 seen_source := true;
417 Printf.printf "%stheir (D|Q)n operands at N%d" read stage
418 | Source_m stage ->
419 seen_source := true;
420 Printf.printf "%stheir (D|Q)m operands at N%d" read stage
421 | Source_d stage ->
422 Printf.printf "%stheir (D|Q)d operands at N%d" read stage
423 | Dest stage ->
424 Printf.printf "produce a result at N%d" stage
425 | Dest_n_after (after, stage) ->
426 Printf.printf "produce a result at N%d on cycle %d" stage (after + 1)
427 in
428 Printf.printf ";; Instructions using this reservation ";
429 let rec f infos x =
430 let sep = if x mod 2 = 1 then "" else "\n;;" in
431 match infos with
432 [] -> assert false
433 | [info] -> describe info; Printf.printf ".\n"
434 | info::(_::[] as infos) ->
435 describe info; Printf.printf ", and%s " sep; f infos (x+1)
436 | info::infos -> describe info; Printf.printf ",%s " sep; f infos (x+1)
437 in
438 f avail 0
439
440 (* Emit a define_insn_reservation for each producer. The latency
441 written in will be its worst-case latency. *)
442 let emit_insn_reservations =
443 List.iter (
444 fun (producer, avail, latency, reservation) ->
445 write_comment producer avail;
446 Printf.printf "(define_insn_reservation \"%s\" %d\n" producer latency;
447 Printf.printf " (and (eq_attr \"tune\" \"cortexa8\")\n";
448 Printf.printf " (eq_attr \"neon_type\" \"%s\"))\n" producer;
449 let str =
450 match reservation with
451 Mul -> "dp" | Mul_2cycle -> "dp_2" | Mul_4cycle -> "dp_4"
452 | Shift -> "dp" | Shift_2cycle -> "dp_2"
453 | ALU -> "dp" | ALU_2cycle -> "dp_2"
454 | Fmul -> "dp" | Fmul_2cycle -> "dp_2"
455 | Fadd -> "fadd" | Fadd_2cycle -> "fadd_2"
456 | Ls 1 -> "ls"
457 | Ls n -> "ls_" ^ (string_of_int n)
458 | Permute 1 -> "perm"
459 | Permute n -> "perm_" ^ (string_of_int n)
460 | Fmul_then_fadd -> "fmul_then_fadd"
461 | Fmul_then_fadd_2 -> "fmul_then_fadd_2"
462 in
463 Printf.printf " \"cortex_a8_neon_%s\")\n\n" str
464 )
465
466 (* Given a guard description, return the name of the C function to
467 be used as the guard for define_bypass. *)
468 let guard_fn g =
469 match g with
470 Guard_only_m -> "arm_neon_only_m_dependency"
471 | Guard_only_n -> "arm_neon_only_n_dependency"
472 | Guard_only_d -> "arm_neon_only_d_dependency"
473 | Guard_none -> assert false
474
475 (* Emit a define_bypass for each bypass. *)
476 let emit_bypasses =
477 List.iter (
478 fun (producer, consumers, latency, guard) ->
479 Printf.printf "(define_bypass %d \"%s\"\n" latency producer;
480 if guard = Guard_none then
481 Printf.printf " \"%s\")\n\n" consumers
482 else
483 begin
484 Printf.printf " \"%s\"\n" consumers;
485 Printf.printf " \"%s\")\n\n" (guard_fn guard)
486 end
487 )
488
489 (* Program entry point. *)
490 let main =
491 let table = calculate_sources availability_table in
492 let worst_cases, bypasses = worst_case_latencies_and_bypasses table in
493 emit_insn_reservations (List.rev worst_cases);
494 Printf.printf ";; Exceptions to the default latencies.\n\n";
495 emit_bypasses bypasses
496