GCC Optimizations

Started by biolizard89, September 25, 2012, 09:02:51 PM

Previous topic - Next topic

biolizard89

So I'm writing a simple loop in C, and trying to convert it to ASM such that it might be embedded in a Gecko code.  Compiling it yields this ASM:

.file "discovery_hook.c"
.gnu_attribute 4, 1
.gnu_attribute 8, 1
.gnu_attribute 12, 1
.section ".text"
.align 2
.globl discover_pointer
.type discover_pointer, @function
discover_pointer:
stwu 1,-40(1)
stw 31,36(1)
mr 31,1
stw 3,24(31)
stw 4,28(31)
lis 0,0x4c4
ori 0,0,47888
mr 9,0
lwz 0,0(9)
addic 0,0,6
stw 0,12(31)
lwz 0,24(31)
addic 0,0,2
slwi 0,0,2
lwz 9,12(31)
add 0,9,0
stw 0,8(31)
.L6:
lwz 0,8(31)
mr 11,0
lwz 9,0(11)
lwz 0,28(31)
cmpw 7,9,0
beq 7,.L8
.L2:
lwz 0,8(31)
mr 11,0
lwz 9,0(11)
li 0,-1
cmpw 7,9,0
beq 7,.L9
.L4:
lwz 0,8(31)
mr 9,0
lwz 0,0(9)
cmpwi 7,0,0
bne 7,.L5
lwz 0,8(31)
lwz 9,28(31)
mr 11,0
stw 9,0(11)
.L5:
lwz 0,12(31)
mr 9,0
lwz 0,0(9)
slwi 0,0,2
lwz 9,8(31)
add 0,9,0
stw 0,8(31)
b .L6
.L8:
nop
b .L7
.L9:
nop
.L7:
addi 11,31,40
lwz 31,-4(11)
mr 1,11
blr
.size discover_pointer, .-discover_pointer
.align 2
.globl test
.type test, @function
test:
stwu 1,-16(1)
mflr 0
stw 0,20(1)
stw 31,12(1)
mr 31,1
# 37 "discovery_hook.c" 1
nop
# 0 "" 2
li 3,16
lis 0,0x8012
ori 4,0,13398
bl discover_pointer
# 41 "discovery_hook.c" 1
nop
# 0 "" 2
li 3,32
lis 0,0x8065
ori 4,0,17185
bl discover_pointer
# 45 "discovery_hook.c" 1
nop
# 0 "" 2
addi 11,31,16
lwz 0,4(11)
mtlr 0
lwz 31,-4(11)
mr 1,11
blr
.size test, .-test
.ident "GCC: (devkitPPC release 21) 4.4.3"


The logic involved in the loop isn't that interesting, what matters to me is that the test routine calls the discover_pointer routine using bl, and both routines end in blr, which is what I would expect.

I then compiled it with the -Os flag, which optimizes for size.  The result:

.file "discovery_hook.c"
.gnu_attribute 4, 1
.gnu_attribute 8, 1
.gnu_attribute 12, 1
.section ".text"
.align 2
.globl discover_pointer
.type discover_pointer, @function
discover_pointer:
lis 9,0x4c4
addi 3,3,2
ori 9,9,47888
slwi 3,3,2
lwz 9,0(9)
addi 9,9,6
add 3,9,3
.L4:
lwz 0,0(3)
cmpw 7,0,4
beqlr- 7
lwz 0,0(3)
cmpwi 7,0,-1
beqlr- 7
lwz 0,0(3)
cmpwi 7,0,0
bne- 7,.L3
stw 4,0(3)
.L3:
lwz 0,0(9)
slwi 0,0,2
add 3,3,0
b .L4
.size discover_pointer, .-discover_pointer
.align 2
.globl test
.type test, @function
test:
# 37 "discovery_hook.c" 1
nop
# 0 "" 2
lis 9,0x4c4
lis 0,0x8012
ori 9,9,47888
ori 0,0,13398
lwz 11,0(9)
addi 9,11,78
addi 11,11,6
.L10:
lwz 10,0(9)
cmpw 7,10,0
beq- 7,.L8
lwz 10,0(9)
cmpwi 7,10,-1
beq- 7,.L8
lwz 10,0(9)
cmpwi 7,10,0
bne- 7,.L9
stw 0,0(9)
.L9:
lwz 10,0(11)
slwi 10,10,2
add 9,9,10
b .L10
.L8:
# 41 "discovery_hook.c" 1
nop
# 0 "" 2
lis 9,0x4c4
lis 0,0x8065
ori 9,9,47888
ori 0,0,17185
lwz 11,0(9)
addi 9,11,142
addi 11,11,6
.L13:
lwz 10,0(9)
cmpw 7,10,0
beq- 7,.L11
lwz 10,0(9)
cmpwi 7,10,-1
beq- 7,.L11
lwz 10,0(9)
cmpwi 7,10,0
bne- 7,.L12
stw 0,0(9)
.L12:
lwz 10,0(11)
slwi 10,10,2
add 9,9,10
b .L13
.L11:
# 45 "discovery_hook.c" 1
nop
# 0 "" 2
blr
.size test, .-test
.ident "GCC: (devkitPPC release 21) 4.4.3"


There are now no "bl" instructions to be found, and the "blr" at the end of discover_pointer is gone too.  What did GCC do here?  Is this code still valid and equivalent to the first one?  If I were to put the 2 arguments for the size-optimized discover_pointer into r3 and r4 and then branch to that routine, would it behave the same as the non-optimized one?

An explanation would be greatly appreciated.

dcx2

Please post the corresponding C code that generated this ASM, so that we know what kind of loop it is, what causes it to terminate, etc.  Might help understand what's going on.

Note that discover_pointer also has beqlr-, which essentially means "blr if equal".  I tried to look for bctrl or blrl or beql or bnel, but couldn't find any.  Strange.  Maybe it in-lined discover_pointer to avoid creating a stack frame?

biolizard89

#2
Quote from: dcx2 on September 25, 2012, 09:18:23 PM
Please post the corresponding C code that generated this ASM, so that we know what kind of loop it is, what causes it to terminate, etc.  Might help understand what's going on.

Note that discover_pointer also has beqlr-, which essentially means "blr if equal".  I tried to look for bctrl or blrl or beql or bnel, but couldn't find any.  Strange.  Maybe it in-lined discover_pointer to avoid creating a stack frame?
Here's the C code:

// arguments are r3, r4
void discover_pointer(long code_list_offset, unsigned long pointer)
{
volatile unsigned long* list_length = (volatile unsigned long*) ( (*((volatile unsigned long*)(80001808))) + 6);
volatile unsigned long* code_check = list_length + (code_list_offset + 2);

for(;;)
{
if(*code_check == pointer)
{
break;
}
if(*code_check == 0xFFFFFFFF)
{
break;
}
if(! (*code_check))
{
*code_check = pointer;
}

code_check += *list_length;
}
}

void test()
{
asm("nop");

discover_pointer(16, 0x80123456);

asm("nop");

discover_pointer(32, 0x80654321);

asm("nop");
}

"blr if equal" actually makes some sense, since there are if statements which result in the function ending.  I was unaware that such an instruction existed, although I'm not surprised in retrospect.  But where did the bl from the test routine go?  :confused:

Edit: also, if GCC inlined the function, why would it choose to do so when I'm optimizing for size, since it made the test function longer?

dcx2

Yeah, it definitely inlined your discover_pointer.  I don't know why it included discover_pointer at the top, but that chunk of code is never called.  If you look, you can see that discover_pointer's code looks suspiciously like the stuff between your nop's (e.g. you see lis 0x4C4 at the beginning of both)

Not sure why it chose to do that.  Being that the function is small, it may actually take up less space to inline it twice rather than handle all the associated stack management.

I hate conditional blr's like beqlr because they interfere with Gecko.NET's ability to automatically detect the beginning and end of functions, because conditional blr's do not necessarily indicate the end of a function, and are often used for detecting corner cases at the beginning of a function (e.g. if (!ptr) return; )

FWIW, any branch can be made into a linking branch, as well.  So you can have bl, bctrl, beql, and even blrl (which I always interpret as "do a barrel roll!" lol) (believe it or not, blrl is actually used by the code handler because occasionally the code handler pretends ctr is non-volatile)

Note that in a conditional bl, the address after the branch is ALWAYS placed in the LR whether or not the branch is taken.  I have actually used this feature before when embedding data inside a C2 code, because it puts a pointer to the code into LR.