A response to response to hello world
Recently I’ve received an email from StackOverflow newsletters with a link to a quite controversial (at first glance) blog, A response to Hello World by Caleb Doxsey. This blog is a response to another curious read, Hello world by Drew DeVault.
In the former article, author compared the performance of a tiny “Hello, World” program in Assembly to the same program in Go. He then tried to optimize the program in Go to run faster and towards the end of an article comes up with a program that is faster than its Assembly counterpart.
And this totally makes sense, since if you give it a good read, you will notice that author did optimize the program in Go but did not do that for the Assembly program.
I have decided to burn few hours of my life and jump onto this topic, since, in my opinion, author did not do a fair comparison.
The original code Caleb has used is (I have added the imports and package declaration required to run the program):
Go
package main
import "strconv"
import "os"
func main() {
n, _ := strconv.Atoi(os.Args[1])
for i := 0; i < n; i++ {
os.Stdout.Write([]byte("hello world\n"))
}
}
Assembly
_start:
mov rdi, [rsp + 16]
call atoi
mov r12, rax
.helloloop:
mov rdx, len
mov rsi, msg
mov rdi, 1
mov rax, 1
syscall
dec r12
cmp r12, 0
jg .helloloop
mov rdi, 0
mov rax, 60
syscall
Optimized Go
package main
import "strconv"
import "os"
import "bufio"
func main() {
n, _ := strconv.Atoi(os.Args[1])
w := bufio.NewWriter(os.Stdout)
defer w.Flush()
for i := 0; i < n; i++ {
w.Write([]byte("hello world"))
}
}
What Caleb did was string concatenation in memory, creating a big string in memory and then calling the system function to print it out only once. Indeed this is way more performant than making an IO syscall every iteration.
Since Caleb did not provide an equal program in Assembly, I decided to fill this gap.
I have spent quite some time (predominantly because I am quite rusty with Assembly at the moment and I forgot that different OSes use different calling conventions).
I came up with the code in Assembly that does exactly the same - allocates the memory, fills it with the repeated "hello world"
string and then makes a syscall to print it to the console.
My optimized Assembly
;; compile me on OSX: nasm -fmacho64 helloworld.asm && ld helloworld.o -lSystem -macosx_version_min 10.13
;; compile me on Linux: nasm -felf64 helloworld.asm && ld helloworld.o # AND DO NOT FORGET TO ALIGN WITH THE SYSCALL CONVENTION BY USING STACK INSTEAD OF REGISTERS
global _main
extern _malloc, _puts, _atoi, _free
section .text
_main:
.load_counter:
cmp rdi, 2
jne .exit_failure
mov r12, rdi
mov r13, [rsi + 8]
push rdi
mov rdi, r13
call _atoi
mov r13, rax
pop rdi
.calculate_memory_length:
mov rdi, r13
mov rax, len
imul rdi, rax
inc rdi
mov r12, rdi
.allocate_memory:
push rax
mov rdi, r12
xor rax, rax
call _malloc
test rax, rax
jz .exit_failure
mov r12, rax
pop rax
.repeat_message:
mov rcx, r13
mov rdi, r12
.repeat_message_1:
push rcx
lea rsi, [rel msg]
mov rcx, len
cld
.copy_message:
mov rax, [rsi]
mov [rdi], rax
inc rsi
inc rdi
dec rcx
jnz .copy_message
.repeat_message_2:
pop rcx
dec rcx
jnz .repeat_message_1
xor rax, rax
mov [rdi], rax
.print_string_builder:
push rdi
mov rdi, r12
xor rax, rax
call _puts
pop rdi
.free_memory:
push rdi
mov rdi, r12
call _free
pop rdi
.exit_success:
xor rdi, rdi
jmp .exit
.exit_failure:
mov rdi, 1
.exit:
mov rax, 0x02000001
syscall
section .data
msg: db "Hello, world", 10
len equ $ - msg
My version with logs
As Caleb highlights in his blog, a lot of “burden” introduced by many languages (referring to the original blog by Drew DeVault) is the debugging and safety related information.
I went ahead and added few logs, memory deallocation and test for allocation success to the code in Assembly:
;; compile me on OSX: nasm -fmacho64 helloworld.asm && ld helloworld.o -lSystem -macosx_version_min 10.13
;; compile me on Linux: nasm -felf64 helloworld.asm && ld helloworld.o # AND DO NOT FORGET TO ALIGN WITH THE SYSCALL CONVENTION BY USING STACK INSTEAD OF REGISTERS
global _main ;; a function declaration - an entry point; use `_start` for Linux
extern _malloc, _puts, _atoi, _printf
section .text
_main:
.load_counter:
push r12
push r13
cmp rdi, 2 ;; check ARGC - program takes exactly ONE argument, so checking if ARGC == 2 (first one being program name)
jne .exit_failure
;; log
mov r12, rdi ;; store ARGC in R12
mov r13, [rsi + 8] ;; store the ARGV[2] (by addressing it with *(ARGV + 1)) in R13
push rsi ;; preserve all the used registers on stack - required by OSX calling convention
push rdi
push rdx
mov rdi, log1 ;; copy format string to RDI
mov rsi, r12 ;; copy first argument for _printf to RSI
mov rdx, r13 ;; copy second argument for _printf to RDX
xor rax, rax ;; zero out RAX to enable variable function arguments
call _printf ;; syscall to printf()
pop rdx ;; restore the registers' values
pop rdi
pop rsi
;; end log
; mov r12, rsi ;; pointer to argv (which is a pointer on its own)
push rdi
mov rdi, r13 ;; move the address of argv[1] (r12 + 0 => argv[0], r12 + 8 => argv[1]) to rdi
call _atoi ;; convert char* argv[1] to int and store the result in rax
mov r13, rax ;; store counter as int in r13
pop rdi
;; log
push rdi
push rsi
mov rdi, log2
mov rsi, r13
xor rax, rax
call _printf
pop rsi
pop rdi
;; end log
.calculate_memory_length:
;; rdi = counter * len
mov rdi, r13
mov rax, len
imul rdi, rax
inc rdi ;; +1 for the terminal character (\0)
mov r12, rdi ;; store total length in r12
.allocate_memory:
push rax
mov rdi, r12 ;; number of bytes to allocate
xor rax, rax
call _malloc
test rax, rax ;; test the return code of _malloc syscall
jz .exit_failure ;; if it is an error - go to exit(1)
mov r12, rax ;; store address in r12
pop rax
.repeat_message:
mov rcx, r13 ;; set outer loop counter to COUNTER
mov rdi, r12 ;; set destination address to the stored address from malloc call
.repeat_message_1:
push rcx ;; push outer loop counter onto stack - we don't need it just yet, but will need it after the inner loop is done
lea rsi, [rel msg] ;; load address of our message to repeat into the source address
mov rcx, len ;; set inner loop counter to the length of a message
cld ;; reset the string copying direction flag
.copy_message:
mov rax, [rsi] ;; load the next byte from the source memory
mov [rdi], rax ;; copy the loaded byte to the destination memory
inc rsi ;; advance source pointer
inc rdi ;; advamce destimation pointer
dec rcx ;; decrease the inner loop counter
jnz .copy_message
.repeat_message_2:
pop rcx ;; restore the outer loop counter
dec rcx ;; decrement the outer loop counter
jnz .repeat_message_1
xor rax, rax
mov [rdi], rax ;; put \0 at current RDI
.print_string_builder:
push rdi
mov rdi, r12 ;; set first argument to the allocated memory start
xor rax, rax
call _puts
pop rdi
.exit_success:
xor rdi, rdi ;; exit argument - exit status - 0
jmp .exit
.exit_failure:
mov rdi, 1
.exit:
mov rax, 0x02000001 ;; syscall id - exit; use `60` for Linux
syscall
section .data
msg: db "Hello, world", 10
len equ $ - msg
log1: db "argc = %d, argv[1] = %s", 10, 0
log2: db "atoi = %d", 10, 0
Comparison
Finally, I ran the aforementioned programs with the following arguments: 1M
, 2.5M
, 5M
, 7.5M
, 10M
, 12.5M
, 15M
and 17.5M
(where M
stands for “million”).
The results became more reasonable: