Anton Ertl
2024-04-09 15:59:58 UTC
30 years ago I wrote a paper about Gforth's locals and also gave some
performance results [ertl94l]. Since then the implementation of
Gforth, and also of locals has changed quite a bit; in particular,
recently we have improved the code generated for TO <local>.
So here I look at it again. As a first example, I look at the fib
benchmark (which has a one-off error, but that makes little difference):
no locals with a local
: fib ( n1 -- n2 ) : fib { n -- n2 }
dup 2 < if n 2 < if
drop 1 1
else else
dup n 1- recurse
1- recurse n 2 - recurse
swap 2 - recurse +
+ then ;
then ;
For 43 fib the performance results (AMD64 instructions, Skylake
cycles) are:
no locals with a local factor
13_805_561_653 15_012_002_369 1.09
41_475_949_910 48_490_119_217 1.17
So those of you who dislike locals can still point to worse
performance of code with locals on Gforth.
Another example is string comparison, where I compared three versions:
\ no locals
: strcmp1 ( addr1 u1 addr2 u2 -- n )
rot 2dup 2>r min 0 ?do ( a1 a2 )
over c@ over c@ - dup if
nip nip 2rdrop unloop exit then
drop
char+ swap char+ swap
loop
2drop r> r> - ;
\ locals with TO
: strcmp2
{ addr1 u1 addr2 u2 -- n }
u1 u2 min 0
?do
addr1 c@ addr2 c@ - dup if
unloop exit then
drop
addr1 char+ TO addr1
addr2 char+ TO addr2
loop
u1 u2 - ;
\ locals without TO
: strcmp3
{ addr1 u1 addr2 u2 -- n }
addr1 addr2
u1 u2 min 0
?do { s1 s2 }
s1 c@ s2 c@ - dup if
unloop exit then
drop
s1 char+ s2 char+
loop
2drop
u1 u2 - ;
I am too lazy to benchmark this, but here's the number of
native-instruction bytes in the inner loop:
129 strcmp1
128 strcmp2
149 strcmp3
Here's the code for body of the loop:
strcmp1 strcmp2 strcmp3
over 1->1 @local0 1->1 >l 1->1
mov $00[r13],r8 mov $00[r13],r8 mov rax,rbp
sub r13,$08 sub r13,$08 add r13,$08
mov r8,$10[r13] mov r8,$00[rbp] lea rbp,-$08[rbp]
c@ 1->1 c@ 1->1 mov -$08[rax],r8
movzx r8d,bytePTR[r8] movzx r8d,bytePTR[r8] mov r8,$00[r13]
over 1->2 @local2 1->2 >l @local0 1->1
mov r15,$08[r13] mov r15,$10[rbp] @local0
c@ 2->2 c@ 2->2 mov rax,rbp
movzx r15d,bytePTR[r15] movzx r15d,bytePTR[r15] lea rbp,-$08[rbp]
- 2->1 - 2->1 mov -$08[rax],r8
sub r8,r15 sub r8,r15 c@ 1->1
dup 1->2 dup 1->2 movzx r8d,bytePTR[r8]
mov r15,r8 mov r15,r8 @local1 1->2
?branch 2->1 ?branch 2->1 mov r15,$08[rbp]
<strcmp1+$A8> <strcmp2+$C0> c@ 2->2
add rbx,$68 add rbx,$68 movzx r15d,bytePTR[r15]
mov rax,[rbx] mov rax,[rbx] - 2->1
test r15,r15 test r15,r15 sub r8,r15
jnz $7FED3DB2C317 jnz $7FED3DB2C0DC dup 1->2
jmp eax jmp eax mov r15,r8
nip 1->1 unloop 1->1 ?branch 2->1
add r13,$08 add r14,$10 <strcmp3+$E0>
nip 1->1 lit 1->2 add rbx,$78
add r13,$08 #32 mov rax,[rbx]
2rdrop 1->1 sub rbx,$10 test r15,r15
add r14,$10 mov r15,-$08[rbx] jnz $7FED3DB2C208
unloop 1->1 lp+! 2->1 jmp eax
add r14,$10 add rbp,r15 unloop 1->1
;s 1->1 ;s 1->1 add r14,$10
mov rbx,[r14] mov rbx,[r14] lit 1->2
add r14,$08 add r14,$08 #48
mov rax,[rbx] mov rax,[rbx] sub rbx,$10
jmp eax jmp eax mov r15,-$08[rbx]
drop 1->1 drop 1->1 lp+! 2->1
mov r8,$08[r13] mov r8,$08[r13] add rbp,r15
add r13,$08 add r13,$08 ;s 1->1
char+ 1->1 @local0 1->2 mov rbx,[r14]
add r8,$01 mov r15,$00[rbp] add r14,$08
swap 1->2 char+ 2->2 mov rax,[rbx]
mov r15,$08[r13] add r15,$01 jmp eax
add r13,$08 !local0 2->1 drop 1->0
char+ 2->2 mov $00[rbp],r15 @local0 0->1
add r15,$01 @local2 1->2 mov r8,$00[rbp]
swap 2->1 mov r15,$10[rbp] char+ 1->1
mov $00[r13],r15 char+ 2->2 add r8,$01
sub r13,$08 add r15,$01 @local1 1->1
(loop) 1->1 !local2 2->1 mov $00[r13],r8
<strcmp1+$40> mov $10[rbp],r15 sub r13,$08
sub rbx,$68 (loop) 1->1 mov r8,$08[rbp]
mov rax,[r14] <strcmp2+$58> char+ 1->1
add rax,$01 sub rbx,$68 add r8,$01
cmp $08[r14],rax mov rax,[r14] (loop)-lp+!# 1->1
mov [r14],rax add rax,$01 <strcmp3+$68>
jz $7FED3DB2C36C cmp $08[r14],rax #16
mov rax,[rbx] mov [r14],rax add rbx,$40
jmp eax jz $7FED3DB2C130 mov rax,[r14]
mov rax,[rbx] mov rsi,-$10[rbx]
jmp eax add rax,$01
cmp $08[r14],rax
jz $7FED3DB2C25F
add rbp,-$08[rbx]
mov [r14],rax
mov rbx,rsi
mov rax,[rsi]
jmp eax
@InProceedings{ertl94l,
author = "M. Anton Ertl",
title = "Automatic Scoping of Local Variables",
booktitle = "EuroForth~'94 Conference Proceedings",
year = "1994",
address = "Winchester, UK",
pages = "31--37",
url = "http://www.complang.tuwien.ac.at/papers/ertl94l.ps.gz",
abstract = "In the process of lifting the restrictions on using
locals in Forth, an interesting problem poses
itself: What does it mean if a local is defined in a
control structure? Where is the local visible? Since
the user can create every possible control structure
in ANS Forth, the answer is not as simple as it may
seem. Ideally, the local is visible at a place if
the control flow {\em must} pass through the
definition of the local to reach this place. This
paper discusses locals in general, the visibility
problem, its solution, the consequences and the
implementation as well as related programming style
questions."
}
- anton
performance results [ertl94l]. Since then the implementation of
Gforth, and also of locals has changed quite a bit; in particular,
recently we have improved the code generated for TO <local>.
So here I look at it again. As a first example, I look at the fib
benchmark (which has a one-off error, but that makes little difference):
no locals with a local
: fib ( n1 -- n2 ) : fib { n -- n2 }
dup 2 < if n 2 < if
drop 1 1
else else
dup n 1- recurse
1- recurse n 2 - recurse
swap 2 - recurse +
+ then ;
then ;
For 43 fib the performance results (AMD64 instructions, Skylake
cycles) are:
no locals with a local factor
13_805_561_653 15_012_002_369 1.09
41_475_949_910 48_490_119_217 1.17
So those of you who dislike locals can still point to worse
performance of code with locals on Gforth.
Another example is string comparison, where I compared three versions:
\ no locals
: strcmp1 ( addr1 u1 addr2 u2 -- n )
rot 2dup 2>r min 0 ?do ( a1 a2 )
over c@ over c@ - dup if
nip nip 2rdrop unloop exit then
drop
char+ swap char+ swap
loop
2drop r> r> - ;
\ locals with TO
: strcmp2
{ addr1 u1 addr2 u2 -- n }
u1 u2 min 0
?do
addr1 c@ addr2 c@ - dup if
unloop exit then
drop
addr1 char+ TO addr1
addr2 char+ TO addr2
loop
u1 u2 - ;
\ locals without TO
: strcmp3
{ addr1 u1 addr2 u2 -- n }
addr1 addr2
u1 u2 min 0
?do { s1 s2 }
s1 c@ s2 c@ - dup if
unloop exit then
drop
s1 char+ s2 char+
loop
2drop
u1 u2 - ;
I am too lazy to benchmark this, but here's the number of
native-instruction bytes in the inner loop:
129 strcmp1
128 strcmp2
149 strcmp3
Here's the code for body of the loop:
strcmp1 strcmp2 strcmp3
over 1->1 @local0 1->1 >l 1->1
mov $00[r13],r8 mov $00[r13],r8 mov rax,rbp
sub r13,$08 sub r13,$08 add r13,$08
mov r8,$10[r13] mov r8,$00[rbp] lea rbp,-$08[rbp]
c@ 1->1 c@ 1->1 mov -$08[rax],r8
movzx r8d,bytePTR[r8] movzx r8d,bytePTR[r8] mov r8,$00[r13]
over 1->2 @local2 1->2 >l @local0 1->1
mov r15,$08[r13] mov r15,$10[rbp] @local0
c@ 2->2 c@ 2->2 mov rax,rbp
movzx r15d,bytePTR[r15] movzx r15d,bytePTR[r15] lea rbp,-$08[rbp]
- 2->1 - 2->1 mov -$08[rax],r8
sub r8,r15 sub r8,r15 c@ 1->1
dup 1->2 dup 1->2 movzx r8d,bytePTR[r8]
mov r15,r8 mov r15,r8 @local1 1->2
?branch 2->1 ?branch 2->1 mov r15,$08[rbp]
<strcmp1+$A8> <strcmp2+$C0> c@ 2->2
add rbx,$68 add rbx,$68 movzx r15d,bytePTR[r15]
mov rax,[rbx] mov rax,[rbx] - 2->1
test r15,r15 test r15,r15 sub r8,r15
jnz $7FED3DB2C317 jnz $7FED3DB2C0DC dup 1->2
jmp eax jmp eax mov r15,r8
nip 1->1 unloop 1->1 ?branch 2->1
add r13,$08 add r14,$10 <strcmp3+$E0>
nip 1->1 lit 1->2 add rbx,$78
add r13,$08 #32 mov rax,[rbx]
2rdrop 1->1 sub rbx,$10 test r15,r15
add r14,$10 mov r15,-$08[rbx] jnz $7FED3DB2C208
unloop 1->1 lp+! 2->1 jmp eax
add r14,$10 add rbp,r15 unloop 1->1
;s 1->1 ;s 1->1 add r14,$10
mov rbx,[r14] mov rbx,[r14] lit 1->2
add r14,$08 add r14,$08 #48
mov rax,[rbx] mov rax,[rbx] sub rbx,$10
jmp eax jmp eax mov r15,-$08[rbx]
drop 1->1 drop 1->1 lp+! 2->1
mov r8,$08[r13] mov r8,$08[r13] add rbp,r15
add r13,$08 add r13,$08 ;s 1->1
char+ 1->1 @local0 1->2 mov rbx,[r14]
add r8,$01 mov r15,$00[rbp] add r14,$08
swap 1->2 char+ 2->2 mov rax,[rbx]
mov r15,$08[r13] add r15,$01 jmp eax
add r13,$08 !local0 2->1 drop 1->0
char+ 2->2 mov $00[rbp],r15 @local0 0->1
add r15,$01 @local2 1->2 mov r8,$00[rbp]
swap 2->1 mov r15,$10[rbp] char+ 1->1
mov $00[r13],r15 char+ 2->2 add r8,$01
sub r13,$08 add r15,$01 @local1 1->1
(loop) 1->1 !local2 2->1 mov $00[r13],r8
<strcmp1+$40> mov $10[rbp],r15 sub r13,$08
sub rbx,$68 (loop) 1->1 mov r8,$08[rbp]
mov rax,[r14] <strcmp2+$58> char+ 1->1
add rax,$01 sub rbx,$68 add r8,$01
cmp $08[r14],rax mov rax,[r14] (loop)-lp+!# 1->1
mov [r14],rax add rax,$01 <strcmp3+$68>
jz $7FED3DB2C36C cmp $08[r14],rax #16
mov rax,[rbx] mov [r14],rax add rbx,$40
jmp eax jz $7FED3DB2C130 mov rax,[r14]
mov rax,[rbx] mov rsi,-$10[rbx]
jmp eax add rax,$01
cmp $08[r14],rax
jz $7FED3DB2C25F
add rbp,-$08[rbx]
mov [r14],rax
mov rbx,rsi
mov rax,[rsi]
jmp eax
@InProceedings{ertl94l,
author = "M. Anton Ertl",
title = "Automatic Scoping of Local Variables",
booktitle = "EuroForth~'94 Conference Proceedings",
year = "1994",
address = "Winchester, UK",
pages = "31--37",
url = "http://www.complang.tuwien.ac.at/papers/ertl94l.ps.gz",
abstract = "In the process of lifting the restrictions on using
locals in Forth, an interesting problem poses
itself: What does it mean if a local is defined in a
control structure? Where is the local visible? Since
the user can create every possible control structure
in ANS Forth, the answer is not as simple as it may
seem. Ideally, the local is visible at a place if
the control flow {\em must} pass through the
definition of the local to reach this place. This
paper discusses locals in general, the visibility
problem, its solution, the consequences and the
implementation as well as related programming style
questions."
}
- anton
--
M. Anton Ertl http://www.complang.tuwien.ac.at/anton/home.html
comp.lang.forth FAQs: http://www.complang.tuwien.ac.at/forth/faq/toc.html
New standard: https://forth-standard.org/
EuroForth 2023: https://euro.theforth.net/2023
M. Anton Ertl http://www.complang.tuwien.ac.at/anton/home.html
comp.lang.forth FAQs: http://www.complang.tuwien.ac.at/forth/faq/toc.html
New standard: https://forth-standard.org/
EuroForth 2023: https://euro.theforth.net/2023