Why are gcc's switch-generated jumps faster than equivalent function calls but only with static linking?
I was curious about how switch jumps perform relative to function calls so I whipped out a quick benchmark:
#!/bin/bash -eu
cat > get.c <<EOF
#include <stdint.h>
int get(int (Getter)(void))
{
uintptr_t getter=(uintptr_t)Getter;
if(1){
switch(getter){
case 0: return $RANDOM;
case 1: return $RANDOM;
case 2: return $RANDOM;
case 3: return $RANDOM;
case 4: return $RANDOM;
case 5: return $RANDOM;
default: return Getter();
}
}else{
if(0==getter) return $RANDOM;
else if(1==getter) return $RANDOM;
else if(2==getter) return $RANDOM;
else if(3==getter) return $RANDOM;
else if(4==getter) return $RANDOM;
else if(5==getter) return $RANDOM;
else return Getter();
}
}
EOF
cat > main.c <<EOF
int get(int (Getter)(void));
int Getter(void){ return 42; }
int main(int C, char**V)
{
if(C==1)
for(int i=0; i<1000000000;i++)
get((int(*)(void))4);
else
for(int i=0; i<1000000000;i++)
get(Getter);
}
EOF
: ${CC:=gcc}
arg='-Os -fpic'
for c in *.c; do $CC $arg -c $c; done
$CC get.o -o libget.so -shared
$CC main.o $PWD/libget.so -o dso
$CC main.o get.o -o dso -o static
set -x
time ./dso
time ./dso 1
time ./static
time ./static 1
The timings (relatively stable) are:
+ ./dso
real 0m3.778s
user 0m3.709s
sys 0m0.056s
+ ./dso 1
real 0m3.739s
user 0m3.736s
sys 0m0.000s
+ ./static
real 0m2.478s
user 0m2.477s
sys 0m0.000s
+ ./static 1
real 0m3.425s
user 0m3.411s
sys 0m0.000s
Why do switch jumps perform quite a bit better but only when the function is linked statically?
Disassembly diff (sdiff-generated) of the dynamic and static version respectively:
000000000000111a <get>: | 0000000000001180 <get>:
cmp $0xc,%rdi cmp $0xc,%rdi
ja 1178 <get+0x5e> | ja 11de <get+0x5e>
lea 0xed9(%rip),%rdx # 2000 <_fini+0xe80> | lea 0xe77(%rip),%rdx # 2004 <_IO_stdin_used
movslq (%rdx,%rdi,4),%rax movslq (%rdx,%rdi,4),%rax
add %rdx,%rax add %rdx,%rax
jmpq *%rax jmpq *%rax
mov $0x132b,%eax mov $0x132b,%eax
retq retq
mov $0x2740,%eax mov $0x2740,%eax
retq retq
mov $0x79b6,%eax mov $0x79b6,%eax
retq retq
mov $0x5234,%eax mov $0x5234,%eax
retq retq
mov $0x6389,%eax mov $0x6389,%eax
retq retq
mov $0x37de,%eax mov $0x37de,%eax
retq retq
mov $0x6a22,%eax mov $0x6a22,%eax
retq retq
mov $0x1a35,%eax mov $0x1a35,%eax
retq retq
mov $0x2ce8,%eax mov $0x2ce8,%eax
retq retq
mov $0x4fed,%eax mov $0x4fed,%eax
retq retq
mov $0xfe3,%eax mov $0xfe3,%eax
retq retq
mov $0x4229,%eax mov $0x4229,%eax
retq retq
jmpq *%rdi jmpq *%rdi
mov $0x529e,%eax mov $0x529e,%eax
retq retq
<
c gcc assembly shared-libraries x86-64
|
show 11 more comments
I was curious about how switch jumps perform relative to function calls so I whipped out a quick benchmark:
#!/bin/bash -eu
cat > get.c <<EOF
#include <stdint.h>
int get(int (Getter)(void))
{
uintptr_t getter=(uintptr_t)Getter;
if(1){
switch(getter){
case 0: return $RANDOM;
case 1: return $RANDOM;
case 2: return $RANDOM;
case 3: return $RANDOM;
case 4: return $RANDOM;
case 5: return $RANDOM;
default: return Getter();
}
}else{
if(0==getter) return $RANDOM;
else if(1==getter) return $RANDOM;
else if(2==getter) return $RANDOM;
else if(3==getter) return $RANDOM;
else if(4==getter) return $RANDOM;
else if(5==getter) return $RANDOM;
else return Getter();
}
}
EOF
cat > main.c <<EOF
int get(int (Getter)(void));
int Getter(void){ return 42; }
int main(int C, char**V)
{
if(C==1)
for(int i=0; i<1000000000;i++)
get((int(*)(void))4);
else
for(int i=0; i<1000000000;i++)
get(Getter);
}
EOF
: ${CC:=gcc}
arg='-Os -fpic'
for c in *.c; do $CC $arg -c $c; done
$CC get.o -o libget.so -shared
$CC main.o $PWD/libget.so -o dso
$CC main.o get.o -o dso -o static
set -x
time ./dso
time ./dso 1
time ./static
time ./static 1
The timings (relatively stable) are:
+ ./dso
real 0m3.778s
user 0m3.709s
sys 0m0.056s
+ ./dso 1
real 0m3.739s
user 0m3.736s
sys 0m0.000s
+ ./static
real 0m2.478s
user 0m2.477s
sys 0m0.000s
+ ./static 1
real 0m3.425s
user 0m3.411s
sys 0m0.000s
Why do switch jumps perform quite a bit better but only when the function is linked statically?
Disassembly diff (sdiff-generated) of the dynamic and static version respectively:
000000000000111a <get>: | 0000000000001180 <get>:
cmp $0xc,%rdi cmp $0xc,%rdi
ja 1178 <get+0x5e> | ja 11de <get+0x5e>
lea 0xed9(%rip),%rdx # 2000 <_fini+0xe80> | lea 0xe77(%rip),%rdx # 2004 <_IO_stdin_used
movslq (%rdx,%rdi,4),%rax movslq (%rdx,%rdi,4),%rax
add %rdx,%rax add %rdx,%rax
jmpq *%rax jmpq *%rax
mov $0x132b,%eax mov $0x132b,%eax
retq retq
mov $0x2740,%eax mov $0x2740,%eax
retq retq
mov $0x79b6,%eax mov $0x79b6,%eax
retq retq
mov $0x5234,%eax mov $0x5234,%eax
retq retq
mov $0x6389,%eax mov $0x6389,%eax
retq retq
mov $0x37de,%eax mov $0x37de,%eax
retq retq
mov $0x6a22,%eax mov $0x6a22,%eax
retq retq
mov $0x1a35,%eax mov $0x1a35,%eax
retq retq
mov $0x2ce8,%eax mov $0x2ce8,%eax
retq retq
mov $0x4fed,%eax mov $0x4fed,%eax
retq retq
mov $0xfe3,%eax mov $0xfe3,%eax
retq retq
mov $0x4229,%eax mov $0x4229,%eax
retq retq
jmpq *%rdi jmpq *%rdi
mov $0x529e,%eax mov $0x529e,%eax
retq retq
<
c gcc assembly shared-libraries x86-64
my guess is it's all about cache locality stackoverflow.com/questions/16699247/… and en.wikipedia.org/wiki/Locality_of_reference
– Sergei Nikulov
Dec 28 '18 at 12:53
My results, using Apple LLVM 10.0.0 clang-1000.11.45.5, show faster results for theswitchcode in both cases. Why would you think there is a general answer not dependent on your compiler? You have not stated what tools you are using. State the compiler and linker versions, use-Sto generate assembly code, and show the assembly code.
– Eric Postpischil
Dec 28 '18 at 12:55
1
why he is speaking about a switch while the difference only concern the fact the called function is linked statically or dynamically ? The switch is not relevant here
– bruno
Dec 28 '18 at 13:09
1
@Someprogrammerdude: I think the compiler is "compressing" the table (storing offsets and adding at runtime instead storing full pointers) not because of-Os, but because it's trying to ensure position-independent code+data. (A static jump table would have absolute addresses hardcoded in it.) I don't think that's a signficant source of extra cost here, just a couple cycles or so of extra branch-miss cost before the right address can be found, if a branch mispredicts. Moreover, the asm is the same for both versions. Try with-no-pie-fno-pie`.
– Peter Cordes
Dec 28 '18 at 13:39
1
@PSkocik then I am guessing that you're branching too deep and spilling items from the branch prediction table or the pipeline stalls or sth. I don't think it is the cache, pretty sure all the code would fit in the innermost cache anyway. i7 for example has got 32k memory in L1.
– Antti Haapala
Dec 28 '18 at 13:49
|
show 11 more comments
I was curious about how switch jumps perform relative to function calls so I whipped out a quick benchmark:
#!/bin/bash -eu
cat > get.c <<EOF
#include <stdint.h>
int get(int (Getter)(void))
{
uintptr_t getter=(uintptr_t)Getter;
if(1){
switch(getter){
case 0: return $RANDOM;
case 1: return $RANDOM;
case 2: return $RANDOM;
case 3: return $RANDOM;
case 4: return $RANDOM;
case 5: return $RANDOM;
default: return Getter();
}
}else{
if(0==getter) return $RANDOM;
else if(1==getter) return $RANDOM;
else if(2==getter) return $RANDOM;
else if(3==getter) return $RANDOM;
else if(4==getter) return $RANDOM;
else if(5==getter) return $RANDOM;
else return Getter();
}
}
EOF
cat > main.c <<EOF
int get(int (Getter)(void));
int Getter(void){ return 42; }
int main(int C, char**V)
{
if(C==1)
for(int i=0; i<1000000000;i++)
get((int(*)(void))4);
else
for(int i=0; i<1000000000;i++)
get(Getter);
}
EOF
: ${CC:=gcc}
arg='-Os -fpic'
for c in *.c; do $CC $arg -c $c; done
$CC get.o -o libget.so -shared
$CC main.o $PWD/libget.so -o dso
$CC main.o get.o -o dso -o static
set -x
time ./dso
time ./dso 1
time ./static
time ./static 1
The timings (relatively stable) are:
+ ./dso
real 0m3.778s
user 0m3.709s
sys 0m0.056s
+ ./dso 1
real 0m3.739s
user 0m3.736s
sys 0m0.000s
+ ./static
real 0m2.478s
user 0m2.477s
sys 0m0.000s
+ ./static 1
real 0m3.425s
user 0m3.411s
sys 0m0.000s
Why do switch jumps perform quite a bit better but only when the function is linked statically?
Disassembly diff (sdiff-generated) of the dynamic and static version respectively:
000000000000111a <get>: | 0000000000001180 <get>:
cmp $0xc,%rdi cmp $0xc,%rdi
ja 1178 <get+0x5e> | ja 11de <get+0x5e>
lea 0xed9(%rip),%rdx # 2000 <_fini+0xe80> | lea 0xe77(%rip),%rdx # 2004 <_IO_stdin_used
movslq (%rdx,%rdi,4),%rax movslq (%rdx,%rdi,4),%rax
add %rdx,%rax add %rdx,%rax
jmpq *%rax jmpq *%rax
mov $0x132b,%eax mov $0x132b,%eax
retq retq
mov $0x2740,%eax mov $0x2740,%eax
retq retq
mov $0x79b6,%eax mov $0x79b6,%eax
retq retq
mov $0x5234,%eax mov $0x5234,%eax
retq retq
mov $0x6389,%eax mov $0x6389,%eax
retq retq
mov $0x37de,%eax mov $0x37de,%eax
retq retq
mov $0x6a22,%eax mov $0x6a22,%eax
retq retq
mov $0x1a35,%eax mov $0x1a35,%eax
retq retq
mov $0x2ce8,%eax mov $0x2ce8,%eax
retq retq
mov $0x4fed,%eax mov $0x4fed,%eax
retq retq
mov $0xfe3,%eax mov $0xfe3,%eax
retq retq
mov $0x4229,%eax mov $0x4229,%eax
retq retq
jmpq *%rdi jmpq *%rdi
mov $0x529e,%eax mov $0x529e,%eax
retq retq
<
c gcc assembly shared-libraries x86-64
I was curious about how switch jumps perform relative to function calls so I whipped out a quick benchmark:
#!/bin/bash -eu
cat > get.c <<EOF
#include <stdint.h>
int get(int (Getter)(void))
{
uintptr_t getter=(uintptr_t)Getter;
if(1){
switch(getter){
case 0: return $RANDOM;
case 1: return $RANDOM;
case 2: return $RANDOM;
case 3: return $RANDOM;
case 4: return $RANDOM;
case 5: return $RANDOM;
default: return Getter();
}
}else{
if(0==getter) return $RANDOM;
else if(1==getter) return $RANDOM;
else if(2==getter) return $RANDOM;
else if(3==getter) return $RANDOM;
else if(4==getter) return $RANDOM;
else if(5==getter) return $RANDOM;
else return Getter();
}
}
EOF
cat > main.c <<EOF
int get(int (Getter)(void));
int Getter(void){ return 42; }
int main(int C, char**V)
{
if(C==1)
for(int i=0; i<1000000000;i++)
get((int(*)(void))4);
else
for(int i=0; i<1000000000;i++)
get(Getter);
}
EOF
: ${CC:=gcc}
arg='-Os -fpic'
for c in *.c; do $CC $arg -c $c; done
$CC get.o -o libget.so -shared
$CC main.o $PWD/libget.so -o dso
$CC main.o get.o -o dso -o static
set -x
time ./dso
time ./dso 1
time ./static
time ./static 1
The timings (relatively stable) are:
+ ./dso
real 0m3.778s
user 0m3.709s
sys 0m0.056s
+ ./dso 1
real 0m3.739s
user 0m3.736s
sys 0m0.000s
+ ./static
real 0m2.478s
user 0m2.477s
sys 0m0.000s
+ ./static 1
real 0m3.425s
user 0m3.411s
sys 0m0.000s
Why do switch jumps perform quite a bit better but only when the function is linked statically?
Disassembly diff (sdiff-generated) of the dynamic and static version respectively:
000000000000111a <get>: | 0000000000001180 <get>:
cmp $0xc,%rdi cmp $0xc,%rdi
ja 1178 <get+0x5e> | ja 11de <get+0x5e>
lea 0xed9(%rip),%rdx # 2000 <_fini+0xe80> | lea 0xe77(%rip),%rdx # 2004 <_IO_stdin_used
movslq (%rdx,%rdi,4),%rax movslq (%rdx,%rdi,4),%rax
add %rdx,%rax add %rdx,%rax
jmpq *%rax jmpq *%rax
mov $0x132b,%eax mov $0x132b,%eax
retq retq
mov $0x2740,%eax mov $0x2740,%eax
retq retq
mov $0x79b6,%eax mov $0x79b6,%eax
retq retq
mov $0x5234,%eax mov $0x5234,%eax
retq retq
mov $0x6389,%eax mov $0x6389,%eax
retq retq
mov $0x37de,%eax mov $0x37de,%eax
retq retq
mov $0x6a22,%eax mov $0x6a22,%eax
retq retq
mov $0x1a35,%eax mov $0x1a35,%eax
retq retq
mov $0x2ce8,%eax mov $0x2ce8,%eax
retq retq
mov $0x4fed,%eax mov $0x4fed,%eax
retq retq
mov $0xfe3,%eax mov $0xfe3,%eax
retq retq
mov $0x4229,%eax mov $0x4229,%eax
retq retq
jmpq *%rdi jmpq *%rdi
mov $0x529e,%eax mov $0x529e,%eax
retq retq
<
c gcc assembly shared-libraries x86-64
c gcc assembly shared-libraries x86-64
edited Dec 28 '18 at 13:12
PSkocik
asked Dec 28 '18 at 12:40
PSkocikPSkocik
32.4k64770
32.4k64770
my guess is it's all about cache locality stackoverflow.com/questions/16699247/… and en.wikipedia.org/wiki/Locality_of_reference
– Sergei Nikulov
Dec 28 '18 at 12:53
My results, using Apple LLVM 10.0.0 clang-1000.11.45.5, show faster results for theswitchcode in both cases. Why would you think there is a general answer not dependent on your compiler? You have not stated what tools you are using. State the compiler and linker versions, use-Sto generate assembly code, and show the assembly code.
– Eric Postpischil
Dec 28 '18 at 12:55
1
why he is speaking about a switch while the difference only concern the fact the called function is linked statically or dynamically ? The switch is not relevant here
– bruno
Dec 28 '18 at 13:09
1
@Someprogrammerdude: I think the compiler is "compressing" the table (storing offsets and adding at runtime instead storing full pointers) not because of-Os, but because it's trying to ensure position-independent code+data. (A static jump table would have absolute addresses hardcoded in it.) I don't think that's a signficant source of extra cost here, just a couple cycles or so of extra branch-miss cost before the right address can be found, if a branch mispredicts. Moreover, the asm is the same for both versions. Try with-no-pie-fno-pie`.
– Peter Cordes
Dec 28 '18 at 13:39
1
@PSkocik then I am guessing that you're branching too deep and spilling items from the branch prediction table or the pipeline stalls or sth. I don't think it is the cache, pretty sure all the code would fit in the innermost cache anyway. i7 for example has got 32k memory in L1.
– Antti Haapala
Dec 28 '18 at 13:49
|
show 11 more comments
my guess is it's all about cache locality stackoverflow.com/questions/16699247/… and en.wikipedia.org/wiki/Locality_of_reference
– Sergei Nikulov
Dec 28 '18 at 12:53
My results, using Apple LLVM 10.0.0 clang-1000.11.45.5, show faster results for theswitchcode in both cases. Why would you think there is a general answer not dependent on your compiler? You have not stated what tools you are using. State the compiler and linker versions, use-Sto generate assembly code, and show the assembly code.
– Eric Postpischil
Dec 28 '18 at 12:55
1
why he is speaking about a switch while the difference only concern the fact the called function is linked statically or dynamically ? The switch is not relevant here
– bruno
Dec 28 '18 at 13:09
1
@Someprogrammerdude: I think the compiler is "compressing" the table (storing offsets and adding at runtime instead storing full pointers) not because of-Os, but because it's trying to ensure position-independent code+data. (A static jump table would have absolute addresses hardcoded in it.) I don't think that's a signficant source of extra cost here, just a couple cycles or so of extra branch-miss cost before the right address can be found, if a branch mispredicts. Moreover, the asm is the same for both versions. Try with-no-pie-fno-pie`.
– Peter Cordes
Dec 28 '18 at 13:39
1
@PSkocik then I am guessing that you're branching too deep and spilling items from the branch prediction table or the pipeline stalls or sth. I don't think it is the cache, pretty sure all the code would fit in the innermost cache anyway. i7 for example has got 32k memory in L1.
– Antti Haapala
Dec 28 '18 at 13:49
my guess is it's all about cache locality stackoverflow.com/questions/16699247/… and en.wikipedia.org/wiki/Locality_of_reference
– Sergei Nikulov
Dec 28 '18 at 12:53
my guess is it's all about cache locality stackoverflow.com/questions/16699247/… and en.wikipedia.org/wiki/Locality_of_reference
– Sergei Nikulov
Dec 28 '18 at 12:53
My results, using Apple LLVM 10.0.0 clang-1000.11.45.5, show faster results for the
switch code in both cases. Why would you think there is a general answer not dependent on your compiler? You have not stated what tools you are using. State the compiler and linker versions, use -S to generate assembly code, and show the assembly code.– Eric Postpischil
Dec 28 '18 at 12:55
My results, using Apple LLVM 10.0.0 clang-1000.11.45.5, show faster results for the
switch code in both cases. Why would you think there is a general answer not dependent on your compiler? You have not stated what tools you are using. State the compiler and linker versions, use -S to generate assembly code, and show the assembly code.– Eric Postpischil
Dec 28 '18 at 12:55
1
1
why he is speaking about a switch while the difference only concern the fact the called function is linked statically or dynamically ? The switch is not relevant here
– bruno
Dec 28 '18 at 13:09
why he is speaking about a switch while the difference only concern the fact the called function is linked statically or dynamically ? The switch is not relevant here
– bruno
Dec 28 '18 at 13:09
1
1
@Someprogrammerdude: I think the compiler is "compressing" the table (storing offsets and adding at runtime instead storing full pointers) not because of
-Os, but because it's trying to ensure position-independent code+data. (A static jump table would have absolute addresses hardcoded in it.) I don't think that's a signficant source of extra cost here, just a couple cycles or so of extra branch-miss cost before the right address can be found, if a branch mispredicts. Moreover, the asm is the same for both versions. Try with -no-pie -fno-pie`.– Peter Cordes
Dec 28 '18 at 13:39
@Someprogrammerdude: I think the compiler is "compressing" the table (storing offsets and adding at runtime instead storing full pointers) not because of
-Os, but because it's trying to ensure position-independent code+data. (A static jump table would have absolute addresses hardcoded in it.) I don't think that's a signficant source of extra cost here, just a couple cycles or so of extra branch-miss cost before the right address can be found, if a branch mispredicts. Moreover, the asm is the same for both versions. Try with -no-pie -fno-pie`.– Peter Cordes
Dec 28 '18 at 13:39
1
1
@PSkocik then I am guessing that you're branching too deep and spilling items from the branch prediction table or the pipeline stalls or sth. I don't think it is the cache, pretty sure all the code would fit in the innermost cache anyway. i7 for example has got 32k memory in L1.
– Antti Haapala
Dec 28 '18 at 13:49
@PSkocik then I am guessing that you're branching too deep and spilling items from the branch prediction table or the pipeline stalls or sth. I don't think it is the cache, pretty sure all the code would fit in the innermost cache anyway. i7 for example has got 32k memory in L1.
– Antti Haapala
Dec 28 '18 at 13:49
|
show 11 more comments
1 Answer
1
active
oldest
votes
The calls can't inline (because you put the definition in a separate file and didn't use link-time optimization).
I think you're measuring the extra overhead of calling through the PLT when calling a function in a shared library, traditional Unix style, which gcc does by default. Use -fno-plt to emit memory-indirect call instructions that use the GOT entry directly, instead of calling a memory-indirect jmp. See Sorry state of dynamic libraries on Linux for more about PLT overhead, or disassemble it yourself. (TODO: add disassembly to this answer.)
I expect -fno-plt will make both versions run almost exactly the same.
The asm for both versions of "get" is identical, modulo different random numbers and different addresses. They presumably perform the same, both slow because gcc misses the optimization of turning the switch into a table lookup. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85585 for that and related stuff. (BTW, gcc compresses the table into offsets instead of using a classic jump table of raw pointers because it's trying to avoid absolute addresses everywhere, even as data. Some targets don't support fixups even for that, and gcc currently avoids them even on targets like x86-64/Linux where it would be fine with runtime fixups. But of course it's silly to do an indirect branch instead of just looking up the data in a table in this case.)
Also related: 32-bit absolute addresses no longer allowed in x86-64 Linux? talks some about the cost of -fpie and -fpic. In this case there's nothing to save by omitting -fpic and/or using -fno-pie -no-pie, because separate files are also keeping the function from inlining, not just possible symbol-interposition / ELF symbol visibility.
You just guessed it without even trying it out? :D Yes.-fno-pltis making both versions run almost exactly the same. Thanks.
– PSkocik
Dec 28 '18 at 14:01
1
@PSkocik: yeah, I know what-fno-pltdoes to the asm, and that a memory-indirectcall get@GOTPCREL(%rip)will perform identically to a directcall rel32in a microbenchmark where the branch predictors are always hot, on modern Intel and AMD CPUs.
– Peter Cordes
Dec 28 '18 at 14:09
1
@PSkocik: BTW, your question title doesn't make sense. You're not comparing switch-jump vs. a function call. You have the same amount of function calls in both cases. You may have written this to compare switch vs. an if chain or tree (again which isn't a function call), but now you're comparing identical code with the only difference being call within an executable (but across files) vs. calling a function in a shared library. Using LTOgcc -O3 -fltoto compile and link will make the single-executable version much faster because get can inline instead of actually calling.
– Peter Cordes
Dec 28 '18 at 14:13
1
@PSkocik: It's slowing down the call from the loop inmaintoget. Either way of compiling, the caller passes the real function pointer (not a pointer to a PLT stub) as an argument, so a call from inside the switch doesn't also go through the PLT.
– Peter Cordes
Dec 28 '18 at 14:31
1
@PSkocik: I'll have to sleep on that and maybe try it myself, not sure why that's the case. If you can check perf counters to see if there are any branch misses, that'd be a good idea. (perf stat ./static 1and so on.)
– Peter Cordes
Dec 28 '18 at 14:41
|
show 3 more comments
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53958793%2fwhy-are-gccs-switch-generated-jumps-faster-than-equivalent-function-calls-but-o%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
The calls can't inline (because you put the definition in a separate file and didn't use link-time optimization).
I think you're measuring the extra overhead of calling through the PLT when calling a function in a shared library, traditional Unix style, which gcc does by default. Use -fno-plt to emit memory-indirect call instructions that use the GOT entry directly, instead of calling a memory-indirect jmp. See Sorry state of dynamic libraries on Linux for more about PLT overhead, or disassemble it yourself. (TODO: add disassembly to this answer.)
I expect -fno-plt will make both versions run almost exactly the same.
The asm for both versions of "get" is identical, modulo different random numbers and different addresses. They presumably perform the same, both slow because gcc misses the optimization of turning the switch into a table lookup. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85585 for that and related stuff. (BTW, gcc compresses the table into offsets instead of using a classic jump table of raw pointers because it's trying to avoid absolute addresses everywhere, even as data. Some targets don't support fixups even for that, and gcc currently avoids them even on targets like x86-64/Linux where it would be fine with runtime fixups. But of course it's silly to do an indirect branch instead of just looking up the data in a table in this case.)
Also related: 32-bit absolute addresses no longer allowed in x86-64 Linux? talks some about the cost of -fpie and -fpic. In this case there's nothing to save by omitting -fpic and/or using -fno-pie -no-pie, because separate files are also keeping the function from inlining, not just possible symbol-interposition / ELF symbol visibility.
You just guessed it without even trying it out? :D Yes.-fno-pltis making both versions run almost exactly the same. Thanks.
– PSkocik
Dec 28 '18 at 14:01
1
@PSkocik: yeah, I know what-fno-pltdoes to the asm, and that a memory-indirectcall get@GOTPCREL(%rip)will perform identically to a directcall rel32in a microbenchmark where the branch predictors are always hot, on modern Intel and AMD CPUs.
– Peter Cordes
Dec 28 '18 at 14:09
1
@PSkocik: BTW, your question title doesn't make sense. You're not comparing switch-jump vs. a function call. You have the same amount of function calls in both cases. You may have written this to compare switch vs. an if chain or tree (again which isn't a function call), but now you're comparing identical code with the only difference being call within an executable (but across files) vs. calling a function in a shared library. Using LTOgcc -O3 -fltoto compile and link will make the single-executable version much faster because get can inline instead of actually calling.
– Peter Cordes
Dec 28 '18 at 14:13
1
@PSkocik: It's slowing down the call from the loop inmaintoget. Either way of compiling, the caller passes the real function pointer (not a pointer to a PLT stub) as an argument, so a call from inside the switch doesn't also go through the PLT.
– Peter Cordes
Dec 28 '18 at 14:31
1
@PSkocik: I'll have to sleep on that and maybe try it myself, not sure why that's the case. If you can check perf counters to see if there are any branch misses, that'd be a good idea. (perf stat ./static 1and so on.)
– Peter Cordes
Dec 28 '18 at 14:41
|
show 3 more comments
The calls can't inline (because you put the definition in a separate file and didn't use link-time optimization).
I think you're measuring the extra overhead of calling through the PLT when calling a function in a shared library, traditional Unix style, which gcc does by default. Use -fno-plt to emit memory-indirect call instructions that use the GOT entry directly, instead of calling a memory-indirect jmp. See Sorry state of dynamic libraries on Linux for more about PLT overhead, or disassemble it yourself. (TODO: add disassembly to this answer.)
I expect -fno-plt will make both versions run almost exactly the same.
The asm for both versions of "get" is identical, modulo different random numbers and different addresses. They presumably perform the same, both slow because gcc misses the optimization of turning the switch into a table lookup. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85585 for that and related stuff. (BTW, gcc compresses the table into offsets instead of using a classic jump table of raw pointers because it's trying to avoid absolute addresses everywhere, even as data. Some targets don't support fixups even for that, and gcc currently avoids them even on targets like x86-64/Linux where it would be fine with runtime fixups. But of course it's silly to do an indirect branch instead of just looking up the data in a table in this case.)
Also related: 32-bit absolute addresses no longer allowed in x86-64 Linux? talks some about the cost of -fpie and -fpic. In this case there's nothing to save by omitting -fpic and/or using -fno-pie -no-pie, because separate files are also keeping the function from inlining, not just possible symbol-interposition / ELF symbol visibility.
You just guessed it without even trying it out? :D Yes.-fno-pltis making both versions run almost exactly the same. Thanks.
– PSkocik
Dec 28 '18 at 14:01
1
@PSkocik: yeah, I know what-fno-pltdoes to the asm, and that a memory-indirectcall get@GOTPCREL(%rip)will perform identically to a directcall rel32in a microbenchmark where the branch predictors are always hot, on modern Intel and AMD CPUs.
– Peter Cordes
Dec 28 '18 at 14:09
1
@PSkocik: BTW, your question title doesn't make sense. You're not comparing switch-jump vs. a function call. You have the same amount of function calls in both cases. You may have written this to compare switch vs. an if chain or tree (again which isn't a function call), but now you're comparing identical code with the only difference being call within an executable (but across files) vs. calling a function in a shared library. Using LTOgcc -O3 -fltoto compile and link will make the single-executable version much faster because get can inline instead of actually calling.
– Peter Cordes
Dec 28 '18 at 14:13
1
@PSkocik: It's slowing down the call from the loop inmaintoget. Either way of compiling, the caller passes the real function pointer (not a pointer to a PLT stub) as an argument, so a call from inside the switch doesn't also go through the PLT.
– Peter Cordes
Dec 28 '18 at 14:31
1
@PSkocik: I'll have to sleep on that and maybe try it myself, not sure why that's the case. If you can check perf counters to see if there are any branch misses, that'd be a good idea. (perf stat ./static 1and so on.)
– Peter Cordes
Dec 28 '18 at 14:41
|
show 3 more comments
The calls can't inline (because you put the definition in a separate file and didn't use link-time optimization).
I think you're measuring the extra overhead of calling through the PLT when calling a function in a shared library, traditional Unix style, which gcc does by default. Use -fno-plt to emit memory-indirect call instructions that use the GOT entry directly, instead of calling a memory-indirect jmp. See Sorry state of dynamic libraries on Linux for more about PLT overhead, or disassemble it yourself. (TODO: add disassembly to this answer.)
I expect -fno-plt will make both versions run almost exactly the same.
The asm for both versions of "get" is identical, modulo different random numbers and different addresses. They presumably perform the same, both slow because gcc misses the optimization of turning the switch into a table lookup. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85585 for that and related stuff. (BTW, gcc compresses the table into offsets instead of using a classic jump table of raw pointers because it's trying to avoid absolute addresses everywhere, even as data. Some targets don't support fixups even for that, and gcc currently avoids them even on targets like x86-64/Linux where it would be fine with runtime fixups. But of course it's silly to do an indirect branch instead of just looking up the data in a table in this case.)
Also related: 32-bit absolute addresses no longer allowed in x86-64 Linux? talks some about the cost of -fpie and -fpic. In this case there's nothing to save by omitting -fpic and/or using -fno-pie -no-pie, because separate files are also keeping the function from inlining, not just possible symbol-interposition / ELF symbol visibility.
The calls can't inline (because you put the definition in a separate file and didn't use link-time optimization).
I think you're measuring the extra overhead of calling through the PLT when calling a function in a shared library, traditional Unix style, which gcc does by default. Use -fno-plt to emit memory-indirect call instructions that use the GOT entry directly, instead of calling a memory-indirect jmp. See Sorry state of dynamic libraries on Linux for more about PLT overhead, or disassemble it yourself. (TODO: add disassembly to this answer.)
I expect -fno-plt will make both versions run almost exactly the same.
The asm for both versions of "get" is identical, modulo different random numbers and different addresses. They presumably perform the same, both slow because gcc misses the optimization of turning the switch into a table lookup. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85585 for that and related stuff. (BTW, gcc compresses the table into offsets instead of using a classic jump table of raw pointers because it's trying to avoid absolute addresses everywhere, even as data. Some targets don't support fixups even for that, and gcc currently avoids them even on targets like x86-64/Linux where it would be fine with runtime fixups. But of course it's silly to do an indirect branch instead of just looking up the data in a table in this case.)
Also related: 32-bit absolute addresses no longer allowed in x86-64 Linux? talks some about the cost of -fpie and -fpic. In this case there's nothing to save by omitting -fpic and/or using -fno-pie -no-pie, because separate files are also keeping the function from inlining, not just possible symbol-interposition / ELF symbol visibility.
answered Dec 28 '18 at 13:57
Peter CordesPeter Cordes
120k17184312
120k17184312
You just guessed it without even trying it out? :D Yes.-fno-pltis making both versions run almost exactly the same. Thanks.
– PSkocik
Dec 28 '18 at 14:01
1
@PSkocik: yeah, I know what-fno-pltdoes to the asm, and that a memory-indirectcall get@GOTPCREL(%rip)will perform identically to a directcall rel32in a microbenchmark where the branch predictors are always hot, on modern Intel and AMD CPUs.
– Peter Cordes
Dec 28 '18 at 14:09
1
@PSkocik: BTW, your question title doesn't make sense. You're not comparing switch-jump vs. a function call. You have the same amount of function calls in both cases. You may have written this to compare switch vs. an if chain or tree (again which isn't a function call), but now you're comparing identical code with the only difference being call within an executable (but across files) vs. calling a function in a shared library. Using LTOgcc -O3 -fltoto compile and link will make the single-executable version much faster because get can inline instead of actually calling.
– Peter Cordes
Dec 28 '18 at 14:13
1
@PSkocik: It's slowing down the call from the loop inmaintoget. Either way of compiling, the caller passes the real function pointer (not a pointer to a PLT stub) as an argument, so a call from inside the switch doesn't also go through the PLT.
– Peter Cordes
Dec 28 '18 at 14:31
1
@PSkocik: I'll have to sleep on that and maybe try it myself, not sure why that's the case. If you can check perf counters to see if there are any branch misses, that'd be a good idea. (perf stat ./static 1and so on.)
– Peter Cordes
Dec 28 '18 at 14:41
|
show 3 more comments
You just guessed it without even trying it out? :D Yes.-fno-pltis making both versions run almost exactly the same. Thanks.
– PSkocik
Dec 28 '18 at 14:01
1
@PSkocik: yeah, I know what-fno-pltdoes to the asm, and that a memory-indirectcall get@GOTPCREL(%rip)will perform identically to a directcall rel32in a microbenchmark where the branch predictors are always hot, on modern Intel and AMD CPUs.
– Peter Cordes
Dec 28 '18 at 14:09
1
@PSkocik: BTW, your question title doesn't make sense. You're not comparing switch-jump vs. a function call. You have the same amount of function calls in both cases. You may have written this to compare switch vs. an if chain or tree (again which isn't a function call), but now you're comparing identical code with the only difference being call within an executable (but across files) vs. calling a function in a shared library. Using LTOgcc -O3 -fltoto compile and link will make the single-executable version much faster because get can inline instead of actually calling.
– Peter Cordes
Dec 28 '18 at 14:13
1
@PSkocik: It's slowing down the call from the loop inmaintoget. Either way of compiling, the caller passes the real function pointer (not a pointer to a PLT stub) as an argument, so a call from inside the switch doesn't also go through the PLT.
– Peter Cordes
Dec 28 '18 at 14:31
1
@PSkocik: I'll have to sleep on that and maybe try it myself, not sure why that's the case. If you can check perf counters to see if there are any branch misses, that'd be a good idea. (perf stat ./static 1and so on.)
– Peter Cordes
Dec 28 '18 at 14:41
You just guessed it without even trying it out? :D Yes.
-fno-plt is making both versions run almost exactly the same. Thanks.– PSkocik
Dec 28 '18 at 14:01
You just guessed it without even trying it out? :D Yes.
-fno-plt is making both versions run almost exactly the same. Thanks.– PSkocik
Dec 28 '18 at 14:01
1
1
@PSkocik: yeah, I know what
-fno-plt does to the asm, and that a memory-indirect call get@GOTPCREL(%rip) will perform identically to a direct call rel32 in a microbenchmark where the branch predictors are always hot, on modern Intel and AMD CPUs.– Peter Cordes
Dec 28 '18 at 14:09
@PSkocik: yeah, I know what
-fno-plt does to the asm, and that a memory-indirect call get@GOTPCREL(%rip) will perform identically to a direct call rel32 in a microbenchmark where the branch predictors are always hot, on modern Intel and AMD CPUs.– Peter Cordes
Dec 28 '18 at 14:09
1
1
@PSkocik: BTW, your question title doesn't make sense. You're not comparing switch-jump vs. a function call. You have the same amount of function calls in both cases. You may have written this to compare switch vs. an if chain or tree (again which isn't a function call), but now you're comparing identical code with the only difference being call within an executable (but across files) vs. calling a function in a shared library. Using LTO
gcc -O3 -flto to compile and link will make the single-executable version much faster because get can inline instead of actually calling.– Peter Cordes
Dec 28 '18 at 14:13
@PSkocik: BTW, your question title doesn't make sense. You're not comparing switch-jump vs. a function call. You have the same amount of function calls in both cases. You may have written this to compare switch vs. an if chain or tree (again which isn't a function call), but now you're comparing identical code with the only difference being call within an executable (but across files) vs. calling a function in a shared library. Using LTO
gcc -O3 -flto to compile and link will make the single-executable version much faster because get can inline instead of actually calling.– Peter Cordes
Dec 28 '18 at 14:13
1
1
@PSkocik: It's slowing down the call from the loop in
main to get. Either way of compiling, the caller passes the real function pointer (not a pointer to a PLT stub) as an argument, so a call from inside the switch doesn't also go through the PLT.– Peter Cordes
Dec 28 '18 at 14:31
@PSkocik: It's slowing down the call from the loop in
main to get. Either way of compiling, the caller passes the real function pointer (not a pointer to a PLT stub) as an argument, so a call from inside the switch doesn't also go through the PLT.– Peter Cordes
Dec 28 '18 at 14:31
1
1
@PSkocik: I'll have to sleep on that and maybe try it myself, not sure why that's the case. If you can check perf counters to see if there are any branch misses, that'd be a good idea. (
perf stat ./static 1 and so on.)– Peter Cordes
Dec 28 '18 at 14:41
@PSkocik: I'll have to sleep on that and maybe try it myself, not sure why that's the case. If you can check perf counters to see if there are any branch misses, that'd be a good idea. (
perf stat ./static 1 and so on.)– Peter Cordes
Dec 28 '18 at 14:41
|
show 3 more comments
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53958793%2fwhy-are-gccs-switch-generated-jumps-faster-than-equivalent-function-calls-but-o%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
my guess is it's all about cache locality stackoverflow.com/questions/16699247/… and en.wikipedia.org/wiki/Locality_of_reference
– Sergei Nikulov
Dec 28 '18 at 12:53
My results, using Apple LLVM 10.0.0 clang-1000.11.45.5, show faster results for the
switchcode in both cases. Why would you think there is a general answer not dependent on your compiler? You have not stated what tools you are using. State the compiler and linker versions, use-Sto generate assembly code, and show the assembly code.– Eric Postpischil
Dec 28 '18 at 12:55
1
why he is speaking about a switch while the difference only concern the fact the called function is linked statically or dynamically ? The switch is not relevant here
– bruno
Dec 28 '18 at 13:09
1
@Someprogrammerdude: I think the compiler is "compressing" the table (storing offsets and adding at runtime instead storing full pointers) not because of
-Os, but because it's trying to ensure position-independent code+data. (A static jump table would have absolute addresses hardcoded in it.) I don't think that's a signficant source of extra cost here, just a couple cycles or so of extra branch-miss cost before the right address can be found, if a branch mispredicts. Moreover, the asm is the same for both versions. Try with-no-pie-fno-pie`.– Peter Cordes
Dec 28 '18 at 13:39
1
@PSkocik then I am guessing that you're branching too deep and spilling items from the branch prediction table or the pipeline stalls or sth. I don't think it is the cache, pretty sure all the code would fit in the innermost cache anyway. i7 for example has got 32k memory in L1.
– Antti Haapala
Dec 28 '18 at 13:49