3.1.4.8 Examining the generated code

How good is the optimized code?

We can turn up optimizations, by calling gccjit;;context;;set_int_option() with GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL:

ctxt.set_int_option (GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL, 3);

One of GCC’s internal representations is called “gimple”. A dump of the initial gimple representation of the code can be seen by setting:

ctxt.set_bool_option (GCC_JIT_BOOL_OPTION_DUMP_INITIAL_GIMPLE, 1);

With optimization on and source locations displayed, this gives:

factorial (signed int arg)
{
  <unnamed type> D.80;
  signed int D.81;
  signed int D.82;
  signed int D.83;
  signed int D.84;
  signed int D.85;
  signed int y;
  signed int x;
  signed int stack_depth;
  signed int stack[8];

  try
    {
      initial:
      stack_depth = 0;
      stack[stack_depth] = arg;
      stack_depth = stack_depth + 1;
      goto instr0;
      instr0:
      /* DUP */:
      stack_depth = stack_depth + -1;
      x = stack[stack_depth];
      stack[stack_depth] = x;
      stack_depth = stack_depth + 1;
      stack[stack_depth] = x;
      stack_depth = stack_depth + 1;
      goto instr1;
      instr1:
      /* PUSH_CONST */:
      stack[stack_depth] = 2;
      stack_depth = stack_depth + 1;
      goto instr2;

      /* etc */

You can see the generated machine code in assembly form via:

ctxt.set_bool_option (GCC_JIT_BOOL_OPTION_DUMP_GENERATED_CODE, 1);
result = ctxt.compile ();

which shows that (on this x86_64 box) the compiler has unrolled the loop and is using MMX instructions to perform several multiplications simultaneously:

        .file   "fake.c"
        .text
.Ltext0:
        .p2align 4,,15
        .globl  factorial
        .type   factorial, @function
factorial:
.LFB0:
        .file 1 "factorial.toy"
        .loc 1 14 0
        .cfi_startproc
.LVL0:
.L2:
        .loc 1 26 0
        cmpl    $1, %edi
        jle     .L13
        leal    -1(%rdi), %edx
        movl    %edx, %ecx
        shrl    $2, %ecx
        leal    0(,%rcx,4), %esi
        testl   %esi, %esi
        je      .L14
        cmpl    $9, %edx
        jbe     .L14
        leal    -2(%rdi), %eax
        movl    %eax, -16(%rsp)
        leal    -3(%rdi), %eax
        movd    -16(%rsp), %xmm0
        movl    %edi, -16(%rsp)
        movl    %eax, -12(%rsp)
        movd    -16(%rsp), %xmm1
        xorl    %eax, %eax
        movl    %edx, -16(%rsp)
        movd    -12(%rsp), %xmm4
        movd    -16(%rsp), %xmm6
        punpckldq       %xmm4, %xmm0
        movdqa  .LC1(%rip), %xmm4
        punpckldq       %xmm6, %xmm1
        punpcklqdq      %xmm0, %xmm1
        movdqa  .LC0(%rip), %xmm0
        jmp     .L5
        # etc - edited for brevity

This is clearly overkill for a function that will likely overflow the int type before the vectorization is worthwhile - but then again, this is a toy example.

Turning down the optimization level to 2:

ctxt.set_int_option (GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL, 2);

yields this code, which is simple enough to quote in its entirety:

        .file   "fake.c"
        .text
        .p2align 4,,15
        .globl  factorial
        .type   factorial, @function
factorial:
.LFB0:
        .cfi_startproc
.L2:
        cmpl    $1, %edi
        jle     .L8
        movl    $1, %edx
        jmp     .L4
        .p2align 4,,10
        .p2align 3
.L6:
        movl    %eax, %edi
.L4:
.L5:
        leal    -1(%rdi), %eax
        imull   %edi, %edx
        cmpl    $1, %eax
        jne     .L6
.L3:
.L7:
        imull   %edx, %eax
        ret
.L8:
        movl    %edi, %eax
        movl    $1, %edx
        jmp     .L7
        .cfi_endproc
.LFE0:
        .size   factorial, .-factorial
        .ident  "GCC: (GNU) 4.9.0 20131023 (Red Hat 0.2-%{gcc_release})"
        .section        .note.GNU-stack,"",@progbits

Note that the stack pushing and popping have been eliminated, as has the recursive call (in favor of an iteration).