>>7
(Oh shit, codepad doesn't handle asm; sorry for the long post.)
As you said, gcc -O2 seems to optimize naive.c using bit operations, inlining of collatzLen, and tail-call elimination. Here is the generated code for loop:
#-----------------------------------------------------------------------
# loop -- x86-64, GAS / AT&T syntax (compiler-generated listing)
#
# Purpose: for every start value s = 2 .. %rdi, run the iteration
#            n even -> n / 2        n odd -> 3*n + 1
#          until n == 1, counting the values visited, and return the s
#          with the largest count (the earliest s wins on a tie).
#
# In:    %rdi = upper bound, compared as a signed 64-bit value
#        (presumably the first integer argument under the SysV AMD64
#        ABI -- confirm against the caller)
# Out:   %rax = best start value; 1 when %rdi <= 1
# Clobb: %rcx, %rdx, %rsi, %r8, %r9, flags
# Stack: not touched; no calls are made
#
# Register roles:
#   %rax = best start value found so far
#   %r9d = chain length of that best start value (32-bit, signed compare)
#   %r8  = start value currently being evaluated
#   %rdx = current value n in the chain of %r8
#   %ecx = length counter for the current chain (starts at 1)
#   %rsi = scratch for the signed divide-by-two sequence
#
# NOTE(review): 3*n + 1 is computed with leaq and is not checked for
# 64-bit overflow.
#-----------------------------------------------------------------------
loop:
# Compiler-emitted local label, presumably the function-begin marker.
.LFB13:
.cfi_startproc
# Early-out test: flags from this compare are consumed by the jle below.
# The three movl instructions in between do not modify flags.
cmpq $1, %rdi
# best start = 1 (also the return value on the early-out path)
movl $1, %eax
# best length = 1
movl $1, %r9d
# first start value to evaluate = 2 (32-bit write zero-extends into %r8)
movl $2, %r8d
# bound <= 1: nothing to scan, return 1
jle .L22
# Align the outer-loop head to 16 bytes if that costs at most 10 bytes of
# padding, otherwise fall back to 8-byte alignment.
.p2align 4,,10
.p2align 3
# ---- outer loop: begin a new chain for start value %r8 ----
.L18:
# n = start
movq %r8, %rdx
# length = 1
movl $1, %ecx
# enter the inner loop at the parity test
jmp .L16
.p2align 4,,10
.p2align 3
# ---- even step: n = n / 2, length++ ----
# Signed division by two: add the sign bit (n >>> 63) to n, then shift
# right arithmetically, which rounds toward zero.
.L24:
movq %rdx, %rsi
addl $1, %ecx
shrq $63, %rsi
addq %rsi, %rdx
sarq %rdx
# chain is finished once n reaches 1
cmpq $1, %rdx
je .L23
# ---- inner loop head: branch on the low bit of n ----
.L16:
testb $1, %dl
# low bit clear -> n is even
je .L24
# odd step: n = n + 2*n + 1 = 3*n + 1, length++
leaq 1(%rdx,%rdx,2), %rdx
addl $1, %ecx
# keep iterating until n reaches 1
cmpq $1, %rdx
jne .L16
# ---- chain finished: keep it only if strictly longer than the best ----
.L23:
cmpl %r9d, %ecx
# length <= best length: not an improvement
jle .L17
# record new best start value and its length
movq %r8, %rax
movl %ecx, %r9d
# ---- advance to the next start value ----
.L17:
addq $1, %r8
# AT&T operand order: this evaluates %rdi - %r8, so jge loops while
# bound >= start (signed).
cmpq %r8, %rdi
jge .L18
# Return with the result in %rax. The rep prefix in front of ret is the
# two-byte return form that compilers of this era emitted; it does not
# change what ret does.
rep
ret
# ---- early-out return (bound <= 1), %rax = 1 ----
.L22:
rep
ret
.cfi_endproc