How good is the optimized code?
We can turn up optimizations, by calling gcc_jit_context_set_int_option() with GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL:
gcc_jit_context_set_int_option ( ctxt, GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL, 3);
One of GCC’s internal representations is called “gimple”. A dump of the initial gimple representation of the code can be seen by setting:
gcc_jit_context_set_bool_option (ctxt,
GCC_JIT_BOOL_OPTION_DUMP_INITIAL_GIMPLE,
1);
With optimization on and source locations displayed, this gives:
factorial (signed int arg)
{
<unnamed type> D.80;
signed int D.81;
signed int D.82;
signed int D.83;
signed int D.84;
signed int D.85;
signed int y;
signed int x;
signed int stack_depth;
signed int stack[8];
try
{
initial:
stack_depth = 0;
stack[stack_depth] = arg;
stack_depth = stack_depth + 1;
goto instr0;
instr0:
/* DUP */:
stack_depth = stack_depth + -1;
x = stack[stack_depth];
stack[stack_depth] = x;
stack_depth = stack_depth + 1;
stack[stack_depth] = x;
stack_depth = stack_depth + 1;
goto instr1;
instr1:
/* PUSH_CONST */:
stack[stack_depth] = 2;
stack_depth = stack_depth + 1;
goto instr2;
/* etc */
You can see the generated machine code in assembly form via:
gcc_jit_context_set_bool_option ( ctxt, GCC_JIT_BOOL_OPTION_DUMP_GENERATED_CODE, 1); result = gcc_jit_context_compile (ctxt);
which shows that (on this x86_64 box) the compiler has unrolled the loop and is using MMX instructions to perform several multiplications simultaneously:
.file "fake.c"
.text
.Ltext0:
.p2align 4,,15
.globl factorial
.type factorial, @function
factorial:
.LFB0:
.file 1 "factorial.toy"
.loc 1 14 0
.cfi_startproc
.LVL0:
.L2:
.loc 1 26 0
cmpl $1, %edi
jle .L13
leal -1(%rdi), %edx
movl %edx, %ecx
shrl $2, %ecx
leal 0(,%rcx,4), %esi
testl %esi, %esi
je .L14
cmpl $9, %edx
jbe .L14
leal -2(%rdi), %eax
movl %eax, -16(%rsp)
leal -3(%rdi), %eax
movd -16(%rsp), %xmm0
movl %edi, -16(%rsp)
movl %eax, -12(%rsp)
movd -16(%rsp), %xmm1
xorl %eax, %eax
movl %edx, -16(%rsp)
movd -12(%rsp), %xmm4
movd -16(%rsp), %xmm6
punpckldq %xmm4, %xmm0
movdqa .LC1(%rip), %xmm4
punpckldq %xmm6, %xmm1
punpcklqdq %xmm0, %xmm1
movdqa .LC0(%rip), %xmm0
jmp .L5
# etc - edited for brevity
This is clearly overkill for a function that will likely overflow the
int type before the vectorization is worthwhile - but then again, this
is a toy example.
Turning down the optimization level to 2:
gcc_jit_context_set_int_option ( ctxt, GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL, 3);
yields this code, which is simple enough to quote in its entirety:
.file "fake.c"
.text
.p2align 4,,15
.globl factorial
.type factorial, @function
factorial:
.LFB0:
.cfi_startproc
.L2:
cmpl $1, %edi
jle .L8
movl $1, %edx
jmp .L4
.p2align 4,,10
.p2align 3
.L6:
movl %eax, %edi
.L4:
.L5:
leal -1(%rdi), %eax
imull %edi, %edx
cmpl $1, %eax
jne .L6
.L3:
.L7:
imull %edx, %eax
ret
.L8:
movl %edi, %eax
movl $1, %edx
jmp .L7
.cfi_endproc
.LFE0:
.size factorial, .-factorial
.ident "GCC: (GNU) 4.9.0 20131023 (Red Hat 0.2-%{gcc_release})"
.section .note.GNU-stack,"",@progbits
Note that the stack pushing and popping have been eliminated, as has the recursive call (in favor of an iteration).