111
|
1 .. Copyright (C) 2014-2017 Free Software Foundation, Inc.
|
|
2 Originally contributed by David Malcolm <dmalcolm@redhat.com>
|
|
3
|
|
4 This is free software: you can redistribute it and/or modify it
|
|
5 under the terms of the GNU General Public License as published by
|
|
6 the Free Software Foundation, either version 3 of the License, or
|
|
7 (at your option) any later version.
|
|
8
|
|
9 This program is distributed in the hope that it will be useful, but
|
|
10 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
12 General Public License for more details.
|
|
13
|
|
14 You should have received a copy of the GNU General Public License
|
|
15 along with this program. If not, see
|
|
16 <http://www.gnu.org/licenses/>.
|
|
17
|
|
18 .. default-domain:: cpp
|
|
19
|
|
20 Tutorial part 4: Adding JIT-compilation to a toy interpreter
|
|
21 ------------------------------------------------------------
|
|
22 In this example we construct a "toy" interpreter, and add JIT-compilation
|
|
23 to it.
|
|
24
|
|
25 Our toy interpreter
|
|
26 *******************
|
|
27
|
|
28 It's a stack-based interpreter, and is intended as a (very simple) example
|
|
29 of the kind of bytecode interpreter seen in dynamic languages such as
|
|
30 Python, Ruby etc.
|
|
31
|
|
32 For the sake of simplicity, our toy virtual machine is very limited:
|
|
33
|
|
34 * The only data type is `int`
|
|
35
|
|
36 * It can only work on one function at a time (so that the only
|
|
37 function call that can be made is to recurse).
|
|
38
|
|
39 * Functions can only take one parameter.
|
|
40
|
|
41 * Functions have a stack of `int` values.
|
|
42
|
|
43 * We'll implement function call within the interpreter by calling a
|
|
44 function in our implementation, rather than implementing our own
|
|
45 frame stack.
|
|
46
|
|
47 * The parser is only good enough to get the examples to work.
|
|
48
|
|
49 Naturally, a real interpreter would be much more complicated that this.
|
|
50
|
|
51 The following operations are supported:
|
|
52
|
|
53 ====================== ======================== =============== ==============
|
|
54 Operation Meaning Old Stack New Stack
|
|
55 ====================== ======================== =============== ==============
|
|
56 DUP Duplicate top of stack. ``[..., x]`` ``[..., x, x]``
|
|
57 ROT Swap top two elements ``[..., x, y]`` ``[..., y, x]``
|
|
58 of stack.
|
|
59 BINARY_ADD Add the top two elements ``[..., x, y]`` ``[..., (x+y)]``
|
|
60 on the stack.
|
|
61 BINARY_SUBTRACT Likewise, but subtract. ``[..., x, y]`` ``[..., (x-y)]``
|
|
62 BINARY_MULT Likewise, but multiply. ``[..., x, y]`` ``[..., (x*y)]``
|
|
63 BINARY_COMPARE_LT Compare the top two ``[..., x, y]`` ``[..., (x<y)]``
|
|
64 elements on the stack
|
|
65 and push a nonzero/zero
|
|
66 if (x<y).
|
|
67 RECURSE Recurse, passing the top ``[..., x]`` ``[..., fn(x)]``
|
|
68 of the stack, and
|
|
69 popping the result.
|
|
70 RETURN Return the top of the ``[x]`` ``[]``
|
|
71 stack.
|
|
72 PUSH_CONST `arg` Push an int const. ``[...]`` ``[..., arg]``
|
|
73 JUMP_ABS_IF_TRUE `arg` Pop; if top of stack was ``[..., x]`` ``[...]``
|
|
74 nonzero, jump to
|
|
75 ``arg``.
|
|
76 ====================== ======================== =============== ==============
|
|
77
|
|
78 Programs can be interpreted, disassembled, and compiled to machine code.
|
|
79
|
|
80 The interpreter reads ``.toy`` scripts. Here's what a simple recursive
|
|
81 factorial program looks like, the script ``factorial.toy``.
|
|
82 The parser ignores lines beginning with a `#`.
|
|
83
|
|
84 .. literalinclude:: ../../examples/tut04-toyvm/factorial.toy
|
|
85 :lines: 1-
|
|
86 :language: c
|
|
87
|
|
88 The interpreter is a simple infinite loop with a big ``switch`` statement
|
|
89 based on what the next opcode is:
|
|
90
|
|
91 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
92 :start-after: /* Execute the given function. */
|
|
93 :end-before: /* JIT compilation. */
|
|
94 :language: c++
|
|
95
|
|
96 Compiling to machine code
|
|
97 *************************
|
|
98 We want to generate machine code that can be cast to this type and
|
|
99 then directly executed in-process:
|
|
100
|
|
101 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
102 :start-after: /* Functions are compiled to this function ptr type. */
|
|
103 :end-before: enum opcode
|
|
104 :language: c++
|
|
105
|
|
106 Our compiler isn't very sophisticated; it takes the implementation of
|
|
107 each opcode above, and maps it directly to the operations supported by
|
|
108 the libgccjit API.
|
|
109
|
|
110 How should we handle the stack? In theory we could calculate what the
|
|
111 stack depth will be at each opcode, and optimize away the stack
|
|
112 manipulation "by hand". We'll see below that libgccjit is able to do
|
|
113 this for us, so we'll implement stack manipulation
|
|
114 in a direct way, by creating a ``stack`` array and ``stack_depth``
|
|
115 variables, local within the generated function, equivalent to this C code:
|
|
116
|
|
117 .. code-block:: c
|
|
118
|
|
119 int stack_depth;
|
|
120 int stack[MAX_STACK_DEPTH];
|
|
121
|
|
122 We'll also have local variables ``x`` and ``y`` for use when implementing
|
|
123 the opcodes, equivalent to this:
|
|
124
|
|
125 .. code-block:: c
|
|
126
|
|
127 int x;
|
|
128 int y;
|
|
129
|
|
130 This means our compiler has the following state:
|
|
131
|
|
132 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
133 :start-after: /* State. */
|
|
134 :end-before: };
|
|
135 :language: c++
|
|
136
|
|
137 Setting things up
|
|
138 *****************
|
|
139
|
|
140 First we create our types:
|
|
141
|
|
142 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
143 :start-after: /* Create types. */
|
|
144 :end-before: /* The constant value 1. */
|
|
145 :language: c++
|
|
146
|
|
147 along with extracting a useful `int` constant:
|
|
148
|
|
149 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
150 :start-after: /* The constant value 1. */
|
|
151 :end-before: /* Create locations. */
|
|
152 :language: c++
|
|
153
|
|
154 We'll implement push and pop in terms of the ``stack`` array and
|
|
155 ``stack_depth``. Here are helper functions for adding statements to
|
|
156 a block, implementing pushing and popping values:
|
|
157
|
|
158 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
159 :start-after: /* Stack manipulation. */
|
|
160 :end-before: /* Create the context. */
|
|
161 :language: c++
|
|
162
|
|
163 We will support single-stepping through the generated code in the
|
|
164 debugger, so we need to create :type:`gccjit::location` instances, one
|
|
165 per operation in the source code. These will reference the lines of
|
|
166 e.g. ``factorial.toy``.
|
|
167
|
|
168 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
169 :start-after: /* Create locations. */
|
|
170 :end-before: /* Creating the function. */
|
|
171 :language: c++
|
|
172
|
|
173 Let's create the function itself. As usual, we create its parameter
|
|
174 first, then use the parameter to create the function:
|
|
175
|
|
176 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
177 :start-after: /* Creating the function. */
|
|
178 :end-before: /* Create stack lvalues. */
|
|
179 :language: c++
|
|
180
|
|
181 We create the locals within the function.
|
|
182
|
|
183 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
184 :start-after: /* Create stack lvalues. */
|
|
185 :end-before: /* 1st pass: create blocks, one per opcode.
|
|
186 :language: c++
|
|
187
|
|
188 Populating the function
|
|
189 ***********************
|
|
190
|
|
191 There's some one-time initialization, and the API treats the first block
|
|
192 you create as the entrypoint of the function, so we need to create that
|
|
193 block first:
|
|
194
|
|
195 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
196 :start-after: first. */
|
|
197 :end-before: /* Create a block per operation. */
|
|
198 :language: c++
|
|
199
|
|
200 We can now create blocks for each of the operations. Most of these will
|
|
201 be consolidated into larger blocks when the optimizer runs.
|
|
202
|
|
203 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
204 :start-after: /* Create a block per operation. */
|
|
205 :end-before: /* Populate the initial block. */
|
|
206 :language: c++
|
|
207
|
|
208 Now that we have a block it can jump to when it's done, we can populate
|
|
209 the initial block:
|
|
210
|
|
211 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
212 :start-after: /* Populate the initial block. */
|
|
213 :end-before: /* 2nd pass: fill in instructions. */
|
|
214 :language: c++
|
|
215
|
|
216 We can now populate the blocks for the individual operations. We loop
|
|
217 through them, adding instructions to their blocks:
|
|
218
|
|
219 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
220 :start-after: /* 2nd pass: fill in instructions. */
|
|
221 :end-before: /* Helper macros. */
|
|
222 :language: c++
|
|
223
|
|
224 We're going to have another big ``switch`` statement for implementing
|
|
225 the opcodes, this time for compiling them, rather than interpreting
|
|
226 them. It's helpful to have macros for implementing push and pop, so that
|
|
227 we can make the ``switch`` statement that's coming up look as much as
|
|
228 possible like the one above within the interpreter:
|
|
229
|
|
230 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
231 :start-after: /* Helper macros. */
|
|
232 :end-before: block.add_comment
|
|
233 :language: c++
|
|
234
|
|
235 .. note::
|
|
236
|
|
237 A particularly clever implementation would have an *identical*
|
|
238 ``switch`` statement shared by the interpreter and the compiler, with
|
|
239 some preprocessor "magic". We're not doing that here, for the sake
|
|
240 of simplicity.
|
|
241
|
|
242 When I first implemented this compiler, I accidentally missed an edit
|
|
243 when copying and pasting the ``Y_EQUALS_POP`` macro, so that popping the
|
|
244 stack into ``y`` instead erroneously assigned it to ``x``, leaving ``y``
|
|
245 uninitialized.
|
|
246
|
|
247 To track this kind of thing down, we can use
|
|
248 :func:`gccjit::block::add_comment` to add descriptive comments
|
|
249 to the internal representation. This is invaluable when looking through
|
|
250 the generated IR for, say ``factorial``:
|
|
251
|
|
252 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
253 :start-after: PUSH_RVALUE (y)
|
|
254 :end-before: /* Handle the individual opcodes. */
|
|
255 :language: c++
|
|
256
|
|
257 We can now write the big ``switch`` statement that implements the
|
|
258 individual opcodes, populating the relevant block with statements:
|
|
259
|
|
260 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
261 :start-after: /* Handle the individual opcodes. */
|
|
262 :end-before: /* Go to the next block. */
|
|
263 :language: c++
|
|
264
|
|
265 Every block must be terminated, via a call to one of the
|
|
266 ``gccjit::block::end_with_`` entrypoints. This has been done for two
|
|
267 of the opcodes, but we need to do it for the other ones, by jumping
|
|
268 to the next block.
|
|
269
|
|
270 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
271 :start-after: /* Go to the next block. */
|
|
272 :end-before: /* end of loop on PC locations. */
|
|
273 :language: c++
|
|
274
|
|
275 This is analogous to simply incrementing the program counter.
|
|
276
|
|
277 Verifying the control flow graph
|
|
278 ********************************
|
|
279 Having finished looping over the blocks, the context is complete.
|
|
280
|
|
281 As before, we can verify that the control flow and statements are sane by
|
|
282 using :func:`gccjit::function::dump_to_dot`:
|
|
283
|
|
284 .. code-block:: c++
|
|
285
|
|
286 fn.dump_to_dot ("/tmp/factorial.dot");
|
|
287
|
|
288 and viewing the result. Note how the label names, comments, and
|
|
289 variable names show up in the dump, to make it easier to spot
|
|
290 errors in our compiler.
|
|
291
|
|
292 .. figure:: ../../intro/factorial.png
|
|
293 :alt: image of a control flow graph
|
|
294
|
|
295 Compiling the context
|
|
296 *********************
|
|
297 Having finished looping over the blocks and populating them with
|
|
298 statements, the context is complete.
|
|
299
|
|
300 We can now compile it, extract machine code from the result, and
|
|
301 run it:
|
|
302
|
|
303 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
304 :start-after: /* Wrapper around a gcc_jit_result *. */
|
|
305 :end-before: /* Functions are compiled to this function ptr type. */
|
|
306 :language: c++
|
|
307
|
|
308 .. literalinclude:: ../../examples/tut04-toyvm/toyvm.cc
|
|
309 :start-after: /* JIT-compilation. */
|
|
310 :end-before: return 0;
|
|
311 :language: c++
|
|
312
|
|
313 Single-stepping through the generated code
|
|
314 ******************************************
|
|
315
|
|
316 It's possible to debug the generated code. To do this we need to both:
|
|
317
|
|
318 * Set up source code locations for our statements, so that we can
|
|
319 meaningfully step through the code. We did this above by
|
|
320 calling :func:`gccjit::context::new_location` and using the
|
|
321 results.
|
|
322
|
|
323 * Enable the generation of debugging information, by setting
|
|
324 :c:macro:`GCC_JIT_BOOL_OPTION_DEBUGINFO` on the
|
|
325 :type:`gccjit::context` via
|
|
326 :func:`gccjit::context::set_bool_option`:
|
|
327
|
|
328 .. code-block:: c++
|
|
329
|
|
330 ctxt.set_bool_option (GCC_JIT_BOOL_OPTION_DEBUGINFO, 1);
|
|
331
|
|
332 Having done this, we can put a breakpoint on the generated function:
|
|
333
|
|
334 .. code-block:: console
|
|
335
|
|
336 $ gdb --args ./toyvm factorial.toy 10
|
|
337 (gdb) break factorial
|
|
338 Function "factorial" not defined.
|
|
339 Make breakpoint pending on future shared library load? (y or [n]) y
|
|
340 Breakpoint 1 (factorial) pending.
|
|
341 (gdb) run
|
|
342 Breakpoint 1, factorial (arg=10) at factorial.toy:14
|
|
343 14 DUP
|
|
344
|
|
345 We've set up location information, which references ``factorial.toy``.
|
|
346 This allows us to use e.g. ``list`` to see where we are in the script:
|
|
347
|
|
348 .. code-block:: console
|
|
349
|
|
350 (gdb) list
|
|
351 9
|
|
352 10 # Initial state:
|
|
353 11 # stack: [arg]
|
|
354 12
|
|
355 13 # 0:
|
|
356 14 DUP
|
|
357 15 # stack: [arg, arg]
|
|
358 16
|
|
359 17 # 1:
|
|
360 18 PUSH_CONST 2
|
|
361
|
|
362 and to step through the function, examining the data:
|
|
363
|
|
364 .. code-block:: console
|
|
365
|
|
366 (gdb) n
|
|
367 18 PUSH_CONST 2
|
|
368 (gdb) n
|
|
369 22 BINARY_COMPARE_LT
|
|
370 (gdb) print stack
|
|
371 $5 = {10, 10, 2, 0, -7152, 32767, 0, 0}
|
|
372 (gdb) print stack_depth
|
|
373 $6 = 3
|
|
374
|
|
375 You'll see that the parts of the ``stack`` array that haven't been
|
|
376 touched yet are uninitialized.
|
|
377
|
|
378 .. note::
|
|
379
|
|
380 Turning on optimizations may lead to unpredictable results when
|
|
381 stepping through the generated code: the execution may appear to
|
|
382 "jump around" the source code. This is analogous to turning up the
|
|
383 optimization level in a regular compiler.
|
|
384
|
|
385 Examining the generated code
|
|
386 ****************************
|
|
387
|
|
388 How good is the optimized code?
|
|
389
|
|
390 We can turn up optimizations, by calling
|
|
391 :cpp:func:`gccjit::context::set_int_option` with
|
|
392 :c:macro:`GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL`:
|
|
393
|
|
394 .. code-block:: c++
|
|
395
|
|
396 ctxt.set_int_option (GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL, 3);
|
|
397
|
|
398 One of GCC's internal representations is called "gimple". A dump of the
|
|
399 initial gimple representation of the code can be seen by setting:
|
|
400
|
|
401 .. code-block:: c++
|
|
402
|
|
403 ctxt.set_bool_option (GCC_JIT_BOOL_OPTION_DUMP_INITIAL_GIMPLE, 1);
|
|
404
|
|
405 With optimization on and source locations displayed, this gives:
|
|
406
|
|
407 .. We'll use "c" for gimple dumps
|
|
408
|
|
409 .. code-block:: c
|
|
410
|
|
411 factorial (signed int arg)
|
|
412 {
|
|
413 <unnamed type> D.80;
|
|
414 signed int D.81;
|
|
415 signed int D.82;
|
|
416 signed int D.83;
|
|
417 signed int D.84;
|
|
418 signed int D.85;
|
|
419 signed int y;
|
|
420 signed int x;
|
|
421 signed int stack_depth;
|
|
422 signed int stack[8];
|
|
423
|
|
424 try
|
|
425 {
|
|
426 initial:
|
|
427 stack_depth = 0;
|
|
428 stack[stack_depth] = arg;
|
|
429 stack_depth = stack_depth + 1;
|
|
430 goto instr0;
|
|
431 instr0:
|
|
432 /* DUP */:
|
|
433 stack_depth = stack_depth + -1;
|
|
434 x = stack[stack_depth];
|
|
435 stack[stack_depth] = x;
|
|
436 stack_depth = stack_depth + 1;
|
|
437 stack[stack_depth] = x;
|
|
438 stack_depth = stack_depth + 1;
|
|
439 goto instr1;
|
|
440 instr1:
|
|
441 /* PUSH_CONST */:
|
|
442 stack[stack_depth] = 2;
|
|
443 stack_depth = stack_depth + 1;
|
|
444 goto instr2;
|
|
445
|
|
446 /* etc */
|
|
447
|
|
448 You can see the generated machine code in assembly form via:
|
|
449
|
|
450 .. code-block:: c++
|
|
451
|
|
452 ctxt.set_bool_option (GCC_JIT_BOOL_OPTION_DUMP_GENERATED_CODE, 1);
|
|
453 result = ctxt.compile ();
|
|
454
|
|
455 which shows that (on this x86_64 box) the compiler has unrolled the loop
|
|
456 and is using MMX instructions to perform several multiplications
|
|
457 simultaneously:
|
|
458
|
|
459 .. code-block:: gas
|
|
460
|
|
461 .file "fake.c"
|
|
462 .text
|
|
463 .Ltext0:
|
|
464 .p2align 4,,15
|
|
465 .globl factorial
|
|
466 .type factorial, @function
|
|
467 factorial:
|
|
468 .LFB0:
|
|
469 .file 1 "factorial.toy"
|
|
470 .loc 1 14 0
|
|
471 .cfi_startproc
|
|
472 .LVL0:
|
|
473 .L2:
|
|
474 .loc 1 26 0
|
|
475 cmpl $1, %edi
|
|
476 jle .L13
|
|
477 leal -1(%rdi), %edx
|
|
478 movl %edx, %ecx
|
|
479 shrl $2, %ecx
|
|
480 leal 0(,%rcx,4), %esi
|
|
481 testl %esi, %esi
|
|
482 je .L14
|
|
483 cmpl $9, %edx
|
|
484 jbe .L14
|
|
485 leal -2(%rdi), %eax
|
|
486 movl %eax, -16(%rsp)
|
|
487 leal -3(%rdi), %eax
|
|
488 movd -16(%rsp), %xmm0
|
|
489 movl %edi, -16(%rsp)
|
|
490 movl %eax, -12(%rsp)
|
|
491 movd -16(%rsp), %xmm1
|
|
492 xorl %eax, %eax
|
|
493 movl %edx, -16(%rsp)
|
|
494 movd -12(%rsp), %xmm4
|
|
495 movd -16(%rsp), %xmm6
|
|
496 punpckldq %xmm4, %xmm0
|
|
497 movdqa .LC1(%rip), %xmm4
|
|
498 punpckldq %xmm6, %xmm1
|
|
499 punpcklqdq %xmm0, %xmm1
|
|
500 movdqa .LC0(%rip), %xmm0
|
|
501 jmp .L5
|
|
502 # etc - edited for brevity
|
|
503
|
|
504 This is clearly overkill for a function that will likely overflow the
|
|
505 ``int`` type before the vectorization is worthwhile - but then again, this
|
|
506 is a toy example.
|
|
507
|
|
508 Turning down the optimization level to 2:
|
|
509
|
|
510 .. code-block:: c++
|
|
511
|
|
512 ctxt.set_int_option (GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL, 2);
|
|
513
|
|
514 yields this code, which is simple enough to quote in its entirety:
|
|
515
|
|
516 .. code-block:: gas
|
|
517
|
|
518 .file "fake.c"
|
|
519 .text
|
|
520 .p2align 4,,15
|
|
521 .globl factorial
|
|
522 .type factorial, @function
|
|
523 factorial:
|
|
524 .LFB0:
|
|
525 .cfi_startproc
|
|
526 .L2:
|
|
527 cmpl $1, %edi
|
|
528 jle .L8
|
|
529 movl $1, %edx
|
|
530 jmp .L4
|
|
531 .p2align 4,,10
|
|
532 .p2align 3
|
|
533 .L6:
|
|
534 movl %eax, %edi
|
|
535 .L4:
|
|
536 .L5:
|
|
537 leal -1(%rdi), %eax
|
|
538 imull %edi, %edx
|
|
539 cmpl $1, %eax
|
|
540 jne .L6
|
|
541 .L3:
|
|
542 .L7:
|
|
543 imull %edx, %eax
|
|
544 ret
|
|
545 .L8:
|
|
546 movl %edi, %eax
|
|
547 movl $1, %edx
|
|
548 jmp .L7
|
|
549 .cfi_endproc
|
|
550 .LFE0:
|
|
551 .size factorial, .-factorial
|
|
552 .ident "GCC: (GNU) 4.9.0 20131023 (Red Hat 0.2-%{gcc_release})"
|
|
553 .section .note.GNU-stack,"",@progbits
|
|
554
|
|
555 Note that the stack pushing and popping have been eliminated, as has the
|
|
556 recursive call (in favor of an iteration).
|
|
557
|
|
558 Putting it all together
|
|
559 ***********************
|
|
560
|
|
561 The complete example can be seen in the source tree at
|
|
562 ``gcc/jit/docs/examples/tut04-toyvm/toyvm.cc``
|
|
563
|
|
564 along with a Makefile and a couple of sample .toy scripts:
|
|
565
|
|
566 .. code-block:: console
|
|
567
|
|
568 $ ls -al
|
|
569 drwxrwxr-x. 2 david david 4096 Sep 19 17:46 .
|
|
570 drwxrwxr-x. 3 david david 4096 Sep 19 15:26 ..
|
|
571 -rw-rw-r--. 1 david david 615 Sep 19 12:43 factorial.toy
|
|
572 -rw-rw-r--. 1 david david 834 Sep 19 13:08 fibonacci.toy
|
|
573 -rw-rw-r--. 1 david david 238 Sep 19 14:22 Makefile
|
|
574 -rw-rw-r--. 1 david david 16457 Sep 19 17:07 toyvm.cc
|
|
575
|
|
576 $ make toyvm
|
|
577 g++ -Wall -g -o toyvm toyvm.cc -lgccjit
|
|
578
|
|
579 $ ./toyvm factorial.toy 10
|
|
580 interpreter result: 3628800
|
|
581 compiler result: 3628800
|
|
582
|
|
583 $ ./toyvm fibonacci.toy 10
|
|
584 interpreter result: 55
|
|
585 compiler result: 55
|
|
586
|
|
587 Behind the curtain: How does our code get optimized?
|
|
588 ****************************************************
|
|
589
|
|
590 Our example is done, but you may be wondering about exactly how the
|
|
591 compiler turned what we gave it into the machine code seen above.
|
|
592
|
|
593 We can examine what the compiler is doing in detail by setting:
|
|
594
|
|
595 .. code-block:: c++
|
|
596
|
|
597 state.ctxt.set_bool_option (GCC_JIT_BOOL_OPTION_DUMP_EVERYTHING, 1);
|
|
598 state.ctxt.set_bool_option (GCC_JIT_BOOL_OPTION_KEEP_INTERMEDIATES, 1);
|
|
599
|
|
600 This will dump detailed information about the compiler's state to a
|
|
601 directory under ``/tmp``, and keep it from being cleaned up.
|
|
602
|
|
603 The precise names and their formats of these files is subject to change.
|
|
604 Higher optimization levels lead to more files.
|
|
605 Here's what I saw (edited for brevity; there were almost 200 files):
|
|
606
|
|
607 .. code-block:: console
|
|
608
|
|
609 intermediate files written to /tmp/libgccjit-KPQbGw
|
|
610 $ ls /tmp/libgccjit-KPQbGw/
|
|
611 fake.c.000i.cgraph
|
|
612 fake.c.000i.type-inheritance
|
|
613 fake.c.004t.gimple
|
|
614 fake.c.007t.omplower
|
|
615 fake.c.008t.lower
|
|
616 fake.c.011t.eh
|
|
617 fake.c.012t.cfg
|
|
618 fake.c.014i.visibility
|
|
619 fake.c.015i.early_local_cleanups
|
|
620 fake.c.016t.ssa
|
|
621 # etc
|
|
622
|
|
623 The gimple code is converted into Static Single Assignment form,
|
|
624 with annotations for use when generating the debuginfo:
|
|
625
|
|
626 .. code-block:: console
|
|
627
|
|
628 $ less /tmp/libgccjit-KPQbGw/fake.c.016t.ssa
|
|
629
|
|
630 .. code-block:: c
|
|
631
|
|
632 ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
|
|
633
|
|
634 factorial (signed int arg)
|
|
635 {
|
|
636 signed int stack[8];
|
|
637 signed int stack_depth;
|
|
638 signed int x;
|
|
639 signed int y;
|
|
640 <unnamed type> _20;
|
|
641 signed int _21;
|
|
642 signed int _38;
|
|
643 signed int _44;
|
|
644 signed int _51;
|
|
645 signed int _56;
|
|
646
|
|
647 initial:
|
|
648 stack_depth_3 = 0;
|
|
649 # DEBUG stack_depth => stack_depth_3
|
|
650 stack[stack_depth_3] = arg_5(D);
|
|
651 stack_depth_7 = stack_depth_3 + 1;
|
|
652 # DEBUG stack_depth => stack_depth_7
|
|
653 # DEBUG instr0 => NULL
|
|
654 # DEBUG /* DUP */ => NULL
|
|
655 stack_depth_8 = stack_depth_7 + -1;
|
|
656 # DEBUG stack_depth => stack_depth_8
|
|
657 x_9 = stack[stack_depth_8];
|
|
658 # DEBUG x => x_9
|
|
659 stack[stack_depth_8] = x_9;
|
|
660 stack_depth_11 = stack_depth_8 + 1;
|
|
661 # DEBUG stack_depth => stack_depth_11
|
|
662 stack[stack_depth_11] = x_9;
|
|
663 stack_depth_13 = stack_depth_11 + 1;
|
|
664 # DEBUG stack_depth => stack_depth_13
|
|
665 # DEBUG instr1 => NULL
|
|
666 # DEBUG /* PUSH_CONST */ => NULL
|
|
667 stack[stack_depth_13] = 2;
|
|
668
|
|
669 /* etc; edited for brevity */
|
|
670
|
|
671 We can perhaps better see the code by turning off
|
|
672 :c:macro:`GCC_JIT_BOOL_OPTION_DEBUGINFO` to suppress all those ``DEBUG``
|
|
673 statements, giving:
|
|
674
|
|
675 .. code-block:: console
|
|
676
|
|
677 $ less /tmp/libgccjit-1Hywc0/fake.c.016t.ssa
|
|
678
|
|
679 .. code-block:: c
|
|
680
|
|
681 ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
|
|
682
|
|
683 factorial (signed int arg)
|
|
684 {
|
|
685 signed int stack[8];
|
|
686 signed int stack_depth;
|
|
687 signed int x;
|
|
688 signed int y;
|
|
689 <unnamed type> _20;
|
|
690 signed int _21;
|
|
691 signed int _38;
|
|
692 signed int _44;
|
|
693 signed int _51;
|
|
694 signed int _56;
|
|
695
|
|
696 initial:
|
|
697 stack_depth_3 = 0;
|
|
698 stack[stack_depth_3] = arg_5(D);
|
|
699 stack_depth_7 = stack_depth_3 + 1;
|
|
700 stack_depth_8 = stack_depth_7 + -1;
|
|
701 x_9 = stack[stack_depth_8];
|
|
702 stack[stack_depth_8] = x_9;
|
|
703 stack_depth_11 = stack_depth_8 + 1;
|
|
704 stack[stack_depth_11] = x_9;
|
|
705 stack_depth_13 = stack_depth_11 + 1;
|
|
706 stack[stack_depth_13] = 2;
|
|
707 stack_depth_15 = stack_depth_13 + 1;
|
|
708 stack_depth_16 = stack_depth_15 + -1;
|
|
709 y_17 = stack[stack_depth_16];
|
|
710 stack_depth_18 = stack_depth_16 + -1;
|
|
711 x_19 = stack[stack_depth_18];
|
|
712 _20 = x_19 < y_17;
|
|
713 _21 = (signed int) _20;
|
|
714 stack[stack_depth_18] = _21;
|
|
715 stack_depth_23 = stack_depth_18 + 1;
|
|
716 stack_depth_24 = stack_depth_23 + -1;
|
|
717 x_25 = stack[stack_depth_24];
|
|
718 if (x_25 != 0)
|
|
719 goto <bb 4> (instr9);
|
|
720 else
|
|
721 goto <bb 3> (instr4);
|
|
722
|
|
723 instr4:
|
|
724 /* DUP */:
|
|
725 stack_depth_26 = stack_depth_24 + -1;
|
|
726 x_27 = stack[stack_depth_26];
|
|
727 stack[stack_depth_26] = x_27;
|
|
728 stack_depth_29 = stack_depth_26 + 1;
|
|
729 stack[stack_depth_29] = x_27;
|
|
730 stack_depth_31 = stack_depth_29 + 1;
|
|
731 stack[stack_depth_31] = 1;
|
|
732 stack_depth_33 = stack_depth_31 + 1;
|
|
733 stack_depth_34 = stack_depth_33 + -1;
|
|
734 y_35 = stack[stack_depth_34];
|
|
735 stack_depth_36 = stack_depth_34 + -1;
|
|
736 x_37 = stack[stack_depth_36];
|
|
737 _38 = x_37 - y_35;
|
|
738 stack[stack_depth_36] = _38;
|
|
739 stack_depth_40 = stack_depth_36 + 1;
|
|
740 stack_depth_41 = stack_depth_40 + -1;
|
|
741 x_42 = stack[stack_depth_41];
|
|
742 _44 = factorial (x_42);
|
|
743 stack[stack_depth_41] = _44;
|
|
744 stack_depth_46 = stack_depth_41 + 1;
|
|
745 stack_depth_47 = stack_depth_46 + -1;
|
|
746 y_48 = stack[stack_depth_47];
|
|
747 stack_depth_49 = stack_depth_47 + -1;
|
|
748 x_50 = stack[stack_depth_49];
|
|
749 _51 = x_50 * y_48;
|
|
750 stack[stack_depth_49] = _51;
|
|
751 stack_depth_53 = stack_depth_49 + 1;
|
|
752
|
|
753 # stack_depth_1 = PHI <stack_depth_24(2), stack_depth_53(3)>
|
|
754 instr9:
|
|
755 /* RETURN */:
|
|
756 stack_depth_54 = stack_depth_1 + -1;
|
|
757 x_55 = stack[stack_depth_54];
|
|
758 _56 = x_55;
|
|
759 stack ={v} {CLOBBER};
|
|
760 return _56;
|
|
761
|
|
762 }
|
|
763
|
|
764 Note in the above how all the :type:`gccjit::block` instances we
|
|
765 created have been consolidated into just 3 blocks in GCC's internal
|
|
766 representation: ``initial``, ``instr4`` and ``instr9``.
|
|
767
|
|
768 Optimizing away stack manipulation
|
|
769 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
770 Recall our simple implementation of stack operations. Let's examine
|
|
771 how the stack operations are optimized away.
|
|
772
|
|
773 After a pass of constant-propagation, the depth of the stack at each
|
|
774 opcode can be determined at compile-time:
|
|
775
|
|
776 .. code-block:: console
|
|
777
|
|
778 $ less /tmp/libgccjit-1Hywc0/fake.c.021t.ccp1
|
|
779
|
|
780 .. code-block:: c
|
|
781
|
|
782 ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
|
|
783
|
|
784 factorial (signed int arg)
|
|
785 {
|
|
786 signed int stack[8];
|
|
787 signed int stack_depth;
|
|
788 signed int x;
|
|
789 signed int y;
|
|
790 <unnamed type> _20;
|
|
791 signed int _21;
|
|
792 signed int _38;
|
|
793 signed int _44;
|
|
794 signed int _51;
|
|
795
|
|
796 initial:
|
|
797 stack[0] = arg_5(D);
|
|
798 x_9 = stack[0];
|
|
799 stack[0] = x_9;
|
|
800 stack[1] = x_9;
|
|
801 stack[2] = 2;
|
|
802 y_17 = stack[2];
|
|
803 x_19 = stack[1];
|
|
804 _20 = x_19 < y_17;
|
|
805 _21 = (signed int) _20;
|
|
806 stack[1] = _21;
|
|
807 x_25 = stack[1];
|
|
808 if (x_25 != 0)
|
|
809 goto <bb 4> (instr9);
|
|
810 else
|
|
811 goto <bb 3> (instr4);
|
|
812
|
|
813 instr4:
|
|
814 /* DUP */:
|
|
815 x_27 = stack[0];
|
|
816 stack[0] = x_27;
|
|
817 stack[1] = x_27;
|
|
818 stack[2] = 1;
|
|
819 y_35 = stack[2];
|
|
820 x_37 = stack[1];
|
|
821 _38 = x_37 - y_35;
|
|
822 stack[1] = _38;
|
|
823 x_42 = stack[1];
|
|
824 _44 = factorial (x_42);
|
|
825 stack[1] = _44;
|
|
826 y_48 = stack[1];
|
|
827 x_50 = stack[0];
|
|
828 _51 = x_50 * y_48;
|
|
829 stack[0] = _51;
|
|
830
|
|
831 instr9:
|
|
832 /* RETURN */:
|
|
833 x_55 = stack[0];
|
|
834 x_56 = x_55;
|
|
835 stack ={v} {CLOBBER};
|
|
836 return x_56;
|
|
837
|
|
838 }
|
|
839
|
|
840 Note how, in the above, all those ``stack_depth`` values are now just
|
|
841 constants: we're accessing specific stack locations at each opcode.
|
|
842
|
|
843 The "esra" pass ("Early Scalar Replacement of Aggregates") breaks
|
|
844 out our "stack" array into individual elements:
|
|
845
|
|
846 .. code-block:: console
|
|
847
|
|
848 $ less /tmp/libgccjit-1Hywc0/fake.c.024t.esra
|
|
849
|
|
850 .. code-block:: c
|
|
851
|
|
852 ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
|
|
853
|
|
854 Created a replacement for stack offset: 0, size: 32: stack$0
|
|
855 Created a replacement for stack offset: 32, size: 32: stack$1
|
|
856 Created a replacement for stack offset: 64, size: 32: stack$2
|
|
857
|
|
858 Symbols to be put in SSA form
|
|
859 { D.89 D.90 D.91 }
|
|
860 Incremental SSA update started at block: 0
|
|
861 Number of blocks in CFG: 5
|
|
862 Number of blocks to update: 4 ( 80%)
|
|
863
|
|
864
|
|
865 factorial (signed int arg)
|
|
866 {
|
|
867 signed int stack$2;
|
|
868 signed int stack$1;
|
|
869 signed int stack$0;
|
|
870 signed int stack[8];
|
|
871 signed int stack_depth;
|
|
872 signed int x;
|
|
873 signed int y;
|
|
874 <unnamed type> _20;
|
|
875 signed int _21;
|
|
876 signed int _38;
|
|
877 signed int _44;
|
|
878 signed int _51;
|
|
879
|
|
880 initial:
|
|
881 stack$0_45 = arg_5(D);
|
|
882 x_9 = stack$0_45;
|
|
883 stack$0_39 = x_9;
|
|
884 stack$1_32 = x_9;
|
|
885 stack$2_30 = 2;
|
|
886 y_17 = stack$2_30;
|
|
887 x_19 = stack$1_32;
|
|
888 _20 = x_19 < y_17;
|
|
889 _21 = (signed int) _20;
|
|
890 stack$1_28 = _21;
|
|
891 x_25 = stack$1_28;
|
|
892 if (x_25 != 0)
|
|
893 goto <bb 4> (instr9);
|
|
894 else
|
|
895 goto <bb 3> (instr4);
|
|
896
|
|
897 instr4:
|
|
898 /* DUP */:
|
|
899 x_27 = stack$0_39;
|
|
900 stack$0_22 = x_27;
|
|
901 stack$1_14 = x_27;
|
|
902 stack$2_12 = 1;
|
|
903 y_35 = stack$2_12;
|
|
904 x_37 = stack$1_14;
|
|
905 _38 = x_37 - y_35;
|
|
906 stack$1_10 = _38;
|
|
907 x_42 = stack$1_10;
|
|
908 _44 = factorial (x_42);
|
|
909 stack$1_6 = _44;
|
|
910 y_48 = stack$1_6;
|
|
911 x_50 = stack$0_22;
|
|
912 _51 = x_50 * y_48;
|
|
913 stack$0_1 = _51;
|
|
914
|
|
915 # stack$0_52 = PHI <stack$0_39(2), stack$0_1(3)>
|
|
916 instr9:
|
|
917 /* RETURN */:
|
|
918 x_55 = stack$0_52;
|
|
919 x_56 = x_55;
|
|
920 stack ={v} {CLOBBER};
|
|
921 return x_56;
|
|
922
|
|
923 }
|
|
924
|
|
925 Hence at this point, all those pushes and pops of the stack are now
|
|
926 simply assignments to specific temporary variables.
|
|
927
|
|
928 After some copy propagation, the stack manipulation has been completely
|
|
929 optimized away:
|
|
930
|
|
931 .. code-block:: console
|
|
932
|
|
933 $ less /tmp/libgccjit-1Hywc0/fake.c.026t.copyprop1
|
|
934
|
|
935 .. code-block:: c
|
|
936
|
|
937 ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
|
|
938
|
|
939 factorial (signed int arg)
|
|
940 {
|
|
941 signed int stack$2;
|
|
942 signed int stack$1;
|
|
943 signed int stack$0;
|
|
944 signed int stack[8];
|
|
945 signed int stack_depth;
|
|
946 signed int x;
|
|
947 signed int y;
|
|
948 <unnamed type> _20;
|
|
949 signed int _21;
|
|
950 signed int _38;
|
|
951 signed int _44;
|
|
952 signed int _51;
|
|
953
|
|
954 initial:
|
|
955 stack$0_39 = arg_5(D);
|
|
956 _20 = arg_5(D) <= 1;
|
|
957 _21 = (signed int) _20;
|
|
958 if (_21 != 0)
|
|
959 goto <bb 4> (instr9);
|
|
960 else
|
|
961 goto <bb 3> (instr4);
|
|
962
|
|
963 instr4:
|
|
964 /* DUP */:
|
|
965 _38 = arg_5(D) + -1;
|
|
966 _44 = factorial (_38);
|
|
967 _51 = arg_5(D) * _44;
|
|
968 stack$0_1 = _51;
|
|
969
|
|
970 # stack$0_52 = PHI <arg_5(D)(2), _51(3)>
|
|
971 instr9:
|
|
972 /* RETURN */:
|
|
973 stack ={v} {CLOBBER};
|
|
974 return stack$0_52;
|
|
975
|
|
976 }
|
|
977
|
|
978 Later on, another pass finally eliminated ``stack_depth`` local and the
|
|
979 unused parts of the `stack`` array altogether:
|
|
980
|
|
981 .. code-block:: console
|
|
982
|
|
983 $ less /tmp/libgccjit-1Hywc0/fake.c.036t.release_ssa
|
|
984
|
|
985 .. code-block:: c
|
|
986
|
|
987 ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
|
|
988
|
|
989 Released 44 names, 314.29%, removed 44 holes
|
|
990 factorial (signed int arg)
|
|
991 {
|
|
992 signed int stack$0;
|
|
993 signed int mult_acc_1;
|
|
994 <unnamed type> _5;
|
|
995 signed int _6;
|
|
996 signed int _7;
|
|
997 signed int mul_tmp_10;
|
|
998 signed int mult_acc_11;
|
|
999 signed int mult_acc_13;
|
|
1000
|
|
1001 # arg_9 = PHI <arg_8(D)(0)>
|
|
1002 # mult_acc_13 = PHI <1(0)>
|
|
1003 initial:
|
|
1004
|
|
1005 <bb 5>:
|
|
1006 # arg_4 = PHI <arg_9(2), _7(3)>
|
|
1007 # mult_acc_1 = PHI <mult_acc_13(2), mult_acc_11(3)>
|
|
1008 _5 = arg_4 <= 1;
|
|
1009 _6 = (signed int) _5;
|
|
1010 if (_6 != 0)
|
|
1011 goto <bb 4> (instr9);
|
|
1012 else
|
|
1013 goto <bb 3> (instr4);
|
|
1014
|
|
1015 instr4:
|
|
1016 /* DUP */:
|
|
1017 _7 = arg_4 + -1;
|
|
1018 mult_acc_11 = mult_acc_1 * arg_4;
|
|
1019 goto <bb 5>;
|
|
1020
|
|
1021 # stack$0_12 = PHI <arg_4(5)>
|
|
1022 instr9:
|
|
1023 /* RETURN */:
|
|
1024 mul_tmp_10 = mult_acc_1 * stack$0_12;
|
|
1025 return mul_tmp_10;
|
|
1026
|
|
1027 }
|
|
1028
|
|
1029
|
|
1030 Elimination of tail recursion
|
|
1031 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
1032 Another significant optimization is the detection that the call to
|
|
1033 ``factorial`` is tail recursion, which can be eliminated in favor of
|
|
1034 an iteration:
|
|
1035
|
|
1036 .. code-block:: console
|
|
1037
|
|
1038 $ less /tmp/libgccjit-1Hywc0/fake.c.030t.tailr1
|
|
1039
|
|
1040 .. code-block:: c
|
|
1041
|
|
1042 ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
|
|
1043
|
|
1044
|
|
1045 Symbols to be put in SSA form
|
|
1046 { D.88 }
|
|
1047 Incremental SSA update started at block: 0
|
|
1048 Number of blocks in CFG: 5
|
|
1049 Number of blocks to update: 4 ( 80%)
|
|
1050
|
|
1051
|
|
1052 factorial (signed int arg)
|
|
1053 {
|
|
1054 signed int stack$2;
|
|
1055 signed int stack$1;
|
|
1056 signed int stack$0;
|
|
1057 signed int stack[8];
|
|
1058 signed int stack_depth;
|
|
1059 signed int x;
|
|
1060 signed int y;
|
|
1061 signed int mult_acc_1;
|
|
1062 <unnamed type> _20;
|
|
1063 signed int _21;
|
|
1064 signed int _38;
|
|
1065 signed int mul_tmp_44;
|
|
1066 signed int mult_acc_51;
|
|
1067
|
|
1068 # arg_5 = PHI <arg_39(D)(0), _38(3)>
|
|
1069 # mult_acc_1 = PHI <1(0), mult_acc_51(3)>
|
|
1070 initial:
|
|
1071 _20 = arg_5 <= 1;
|
|
1072 _21 = (signed int) _20;
|
|
1073 if (_21 != 0)
|
|
1074 goto <bb 4> (instr9);
|
|
1075 else
|
|
1076 goto <bb 3> (instr4);
|
|
1077
|
|
1078 instr4:
|
|
1079 /* DUP */:
|
|
1080 _38 = arg_5 + -1;
|
|
1081 mult_acc_51 = mult_acc_1 * arg_5;
|
|
1082 goto <bb 2> (initial);
|
|
1083
|
|
1084 # stack$0_52 = PHI <arg_5(2)>
|
|
1085 instr9:
|
|
1086 /* RETURN */:
|
|
1087 stack ={v} {CLOBBER};
|
|
1088 mul_tmp_44 = mult_acc_1 * stack$0_52;
|
|
1089 return mul_tmp_44;
|
|
1090
|
|
1091 }
|