; compile with:
; $ [ny]asm -felf(32|64) -oe.o e.asm
; $ (gcc|clang) -m(32|64) -oe e.o -nostdlib -nostartfiles

section     .text
global      _start

%if __BITS__ == 32
%define r(n) e%+n
%define SYS_write 4
%define rarg0 ebx
%define rarg1 ecx
%define rarg2 edx
%define syscall int 0x80
%else
%define r(n) r%+n
%define SYS_write 1
%define rarg0 rdi
%define rarg1 rsi
%define rarg2 rdx
default rel
%endif

; size of a Linux pipe buffer
%define PIPE_SIZE 0x10000
%define STDOUT_FILENO 1

; Instead of simply storing a char in .rodata and write(2)-ing it
; over and over again, we first fill a buffer full of e's, and *then*
; write the entire buffer. This is much faster than the first option,
; because we only need to issue a syscall once every 65536 bytes. (Remember
; that doing a syscall requires the kernel to handle an interrupt etc etc etc.)

_start:
        ; allocate space for the message
        mov r(cx), PIPE_SIZE
        mov r(bx), r(cx) ; we'll need it later
        sub r(sp), r(cx)

        ; quick memset(3)
        mov al, 'e'
        mov r(di), r(sp)
        rep stosb

        ; push+pop is actually a smaller encoding than mov for ints that fit within 8 bit
        push STDOUT_FILENO
        pop rarg0
        mov rarg1, r(sp)
        mov rarg2, r(bx)

.loop:
        ; set this within the loop because the syscall's exit code is placed in r(ax)
        push SYS_write
        pop r(ax)
        syscall
        jmp short .loop