Recall our simple implementation of stack operations. Let’s examine how the stack operations are optimized away.
After a pass of constant-propagation, the depth of the stack at each opcode can be determined at compile-time:
$ less /tmp/libgccjit-1Hywc0/fake.c.021t.ccp1
;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0) factorial (signed int arg) { signed int stack[8]; signed int stack_depth; signed int x; signed int y; <unnamed type> _20; signed int _21; signed int _38; signed int _44; signed int _51; initial: stack[0] = arg_5(D); x_9 = stack[0]; stack[0] = x_9; stack[1] = x_9; stack[2] = 2; y_17 = stack[2]; x_19 = stack[1]; _20 = x_19 < y_17; _21 = (signed int) _20; stack[1] = _21; x_25 = stack[1]; if (x_25 != 0) goto <bb 4> (instr9); else goto <bb 3> (instr4); instr4: /* DUP */: x_27 = stack[0]; stack[0] = x_27; stack[1] = x_27; stack[2] = 1; y_35 = stack[2]; x_37 = stack[1]; _38 = x_37 - y_35; stack[1] = _38; x_42 = stack[1]; _44 = factorial (x_42); stack[1] = _44; y_48 = stack[1]; x_50 = stack[0]; _51 = x_50 * y_48; stack[0] = _51; instr9: /* RETURN */: x_55 = stack[0]; x_56 = x_55; stack ={v} {CLOBBER}; return x_56; }
Note how, in the above, all those stack_depth
values are now just
constants: we’re accessing specific stack locations at each opcode.
The “esra” pass (“Early Scalar Replacement of Aggregates”) breaks out our “stack” array into individual elements:
$ less /tmp/libgccjit-1Hywc0/fake.c.024t.esra
;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0) Created a replacement for stack offset: 0, size: 32: stack$0 Created a replacement for stack offset: 32, size: 32: stack$1 Created a replacement for stack offset: 64, size: 32: stack$2 Symbols to be put in SSA form { D.89 D.90 D.91 } Incremental SSA update started at block: 0 Number of blocks in CFG: 5 Number of blocks to update: 4 ( 80%) factorial (signed int arg) { signed int stack$2; signed int stack$1; signed int stack$0; signed int stack[8]; signed int stack_depth; signed int x; signed int y; <unnamed type> _20; signed int _21; signed int _38; signed int _44; signed int _51; initial: stack$0_45 = arg_5(D); x_9 = stack$0_45; stack$0_39 = x_9; stack$1_32 = x_9; stack$2_30 = 2; y_17 = stack$2_30; x_19 = stack$1_32; _20 = x_19 < y_17; _21 = (signed int) _20; stack$1_28 = _21; x_25 = stack$1_28; if (x_25 != 0) goto <bb 4> (instr9); else goto <bb 3> (instr4); instr4: /* DUP */: x_27 = stack$0_39; stack$0_22 = x_27; stack$1_14 = x_27; stack$2_12 = 1; y_35 = stack$2_12; x_37 = stack$1_14; _38 = x_37 - y_35; stack$1_10 = _38; x_42 = stack$1_10; _44 = factorial (x_42); stack$1_6 = _44; y_48 = stack$1_6; x_50 = stack$0_22; _51 = x_50 * y_48; stack$0_1 = _51; # stack$0_52 = PHI <stack$0_39(2), stack$0_1(3)> instr9: /* RETURN */: x_55 = stack$0_52; x_56 = x_55; stack ={v} {CLOBBER}; return x_56; }
Hence at this point, all those pushes and pops of the stack are now simply assignments to specific temporary variables.
After some copy propagation, the stack manipulation has been completely optimized away:
$ less /tmp/libgccjit-1Hywc0/fake.c.026t.copyprop1
;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0) factorial (signed int arg) { signed int stack$2; signed int stack$1; signed int stack$0; signed int stack[8]; signed int stack_depth; signed int x; signed int y; <unnamed type> _20; signed int _21; signed int _38; signed int _44; signed int _51; initial: stack$0_39 = arg_5(D); _20 = arg_5(D) <= 1; _21 = (signed int) _20; if (_21 != 0) goto <bb 4> (instr9); else goto <bb 3> (instr4); instr4: /* DUP */: _38 = arg_5(D) + -1; _44 = factorial (_38); _51 = arg_5(D) * _44; stack$0_1 = _51; # stack$0_52 = PHI <arg_5(D)(2), _51(3)> instr9: /* RETURN */: stack ={v} {CLOBBER}; return stack$0_52; }
Later on, another pass finally eliminated stack_depth
local and the
unused parts of the stack‘ array altogether:
$ less /tmp/libgccjit-1Hywc0/fake.c.036t.release_ssa
;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0) Released 44 names, 314.29%, removed 44 holes factorial (signed int arg) { signed int stack$0; signed int mult_acc_1; <unnamed type> _5; signed int _6; signed int _7; signed int mul_tmp_10; signed int mult_acc_11; signed int mult_acc_13; # arg_9 = PHI <arg_8(D)(0)> # mult_acc_13 = PHI <1(0)> initial: <bb 5>: # arg_4 = PHI <arg_9(2), _7(3)> # mult_acc_1 = PHI <mult_acc_13(2), mult_acc_11(3)> _5 = arg_4 <= 1; _6 = (signed int) _5; if (_6 != 0) goto <bb 4> (instr9); else goto <bb 3> (instr4); instr4: /* DUP */: _7 = arg_4 + -1; mult_acc_11 = mult_acc_1 * arg_4; goto <bb 5>; # stack$0_12 = PHI <arg_4(5)> instr9: /* RETURN */: mul_tmp_10 = mult_acc_1 * stack$0_12; return mul_tmp_10; }