diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/string/x86_64/memset.s | 81 | 
1 files changed, 55 insertions, 26 deletions
diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
index 3cc8fcf6..2d3f5e52 100644
--- a/src/string/x86_64/memset.s
+++ b/src/string/x86_64/memset.s
@@ -1,43 +1,72 @@
 .global memset
 .type memset,@function
 memset:
-	movzbl %sil,%esi
-	mov $0x101010101010101,%rax
-	# 64-bit imul has 3-7 cycles latency, launch early
-	imul %rsi,%rax
+	movzbq %sil,%rax
+	mov $0x101010101010101,%r8
+	imul %r8,%rax
 
-	cmp $16,%rdx
-	jb 1f
+	cmp $126,%rdx
+	ja 2f
 
-	lea -1(%rdx),%rcx
-	mov %rdi,%r8
-	shr $3,%rcx
-	mov %rax,-8(%rdi,%rdx)
-	rep
-	stosq
-	mov %r8,%rax
-	ret
-
-1:	test %edx,%edx
+	test %edx,%edx
 	jz 1f
 
-	mov %al,(%rdi)
-	mov %al,-1(%rdi,%rdx)
+	mov %sil,(%rdi)
+	mov %sil,-1(%rdi,%rdx)
 	cmp $2,%edx
 	jbe 1f
 
-	mov %al,1(%rdi)
-	mov %al,-2(%rdi,%rdx)
-	cmp $4,%edx
+	mov %ax,1(%rdi)
+	mov %ax,(-1-2)(%rdi,%rdx)
+	cmp $6,%edx
+	jbe 1f
+
+	mov %eax,(1+2)(%rdi)
+	mov %eax,(-1-2-4)(%rdi,%rdx)
+	cmp $14,%edx
+	jbe 1f
+
+	mov %rax,(1+2+4)(%rdi)
+	mov %rax,(-1-2-4-8)(%rdi,%rdx)
+	cmp $30,%edx
 	jbe 1f
 
-	mov %eax,(%rdi)
-	mov %eax,-4(%rdi,%rdx)
-	cmp $8,%edx
+	mov %rax,(1+2+4+8)(%rdi)
+	mov %rax,(1+2+4+8+8)(%rdi)
+	mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
+	cmp $62,%edx
 	jbe 1f
 
-	mov %eax,4(%rdi)
-	mov %eax,-8(%rdi,%rdx)
+	mov %rax,(1+2+4+8+16)(%rdi)
+	mov %rax,(1+2+4+8+16+8)(%rdi)
+	mov %rax,(1+2+4+8+16+16)(%rdi)
+	mov %rax,(1+2+4+8+16+24)(%rdi)
+	mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)
 
 1:	mov %rdi,%rax
 	ret
+
+2:	test $15,%edi
+	mov %rdi,%r8
+	mov %rax,-8(%rdi,%rdx)
+	mov %rdx,%rcx
+	jnz 2f
+
+1:	shr $3,%rcx
+	rep
+	stosq
+	mov %r8,%rax
+	ret
+
+2:	xor %edx,%edx
+	sub %edi,%edx
+	and $15,%edx
+	mov %rax,(%rdi)
+	mov %rax,8(%rdi)
+	sub %rdx,%rcx
+	add %rdx,%rdi
+	jmp 1b
