/*
 * For reference, here's a simple C version.
 *
 * It fills a heap buffer with N back-to-back copies of the L-byte string
 * "hello\n" using memcpy. When N is small (<= 100) the buffer is also
 * NUL-terminated and printed, so the result can be checked by eye.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define N 100000000 /* number of copies of h written into the buffer */
#define L 6         /* bytes per copy: the length of "hello\n" */

/* Exactly L bytes, so the array deliberately holds no trailing NUL. */
char h[L] = "hello\n";

/*
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
    /* Size arithmetic is done in size_t so that raising N cannot overflow int. */
    const size_t total = (size_t)N * L;

    /* To see if it works, decrease N: small runs print the buffer. */
    const int show_result = (N <= 100);

    /* The +1 is for the terminator needed by the final printing. */
    char *ptr = malloc(show_result ? total + 1 : total);
    if (ptr == NULL) {
        fprintf(stderr, "cannot allocate %zu bytes\n", total);
        return EXIT_FAILURE;
    }

    /* Append one copy of h per iteration, advancing the write cursor. */
    char *p = ptr;
    for (int i = 0; i < N; i++) {
        memcpy(p, h, L);
        p += L;
    }

    if (show_result) {
        ptr[total] = '\0';
        printf("%s", ptr);
    }

    free(ptr);
    return 0;
}

/*
 * Notes from the original post (measured on the program as originally
 * posted):
 *
 * It takes 0.59 s to run on the same PC and OS as before, using both GCC
 * and LLVM-GCC. (Note that the loop in the code is a bit different now;
 * the end result is the same.)
 *
 * The ASM of the inner loop:
 *
 *   L:
 *       movl    _h, %eax
 *       movl    %eax, (%edx)
 *       movzwl  _h+4, %eax
 *       movw    %ax, 4(%edx)
 *       addl    $6, %edx
 *       cmpl    %ecx, %edx
 *       jne     L
 *
 * Bye,
 * bearophile
 */