| author | Rich Felker <dalias@aerifal.cx> | 2015-02-26 02:07:08 -0500 |
|---|---|---|
| committer | Rich Felker <dalias@aerifal.cx> | 2015-02-26 02:07:08 -0500 |
| commit | e346ff86c8faee901a7c2d502b5beb983b99f972 (patch) | |
| tree | 7f08e43d71fab39cf3b8379625944193322ff933 /src | |
| parent | 69858fa93107aa7485b143c54137e745a7b7ad72 (diff) | |
| download | musl-e346ff86c8faee901a7c2d502b5beb983b99f972.tar.gz | |
overhaul optimized x86_64 memset asm
on most cpu models, "rep stosq" has high overhead that makes it
undesirable for small memset sizes. the new code extends the
minimal-branch fast path for short memsets from size 15 up to size
126, and shrink-wraps this code path. in addition, "rep stosq" is
sensitive to misalignment. the cost varies with size and with cpu
model, but it has been observed performing 1.5 times slower when the
destination address is not aligned mod 16. the new code thus ensures
alignment mod 16, but also preserves any existing additional
alignment, in case there are cpu models where it is beneficial.
this version is based in part on changes proposed by Denys Vlasenko.
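As an illustration of the technique, the extended short-size fast path can be sketched in C. This is only a rough rendering of the idea, not the committed asm: the helper name small_memset is made up for the sketch, and memcpy of a replicated 64-bit word stands in for the unaligned 1/2/4/8-byte register stores used in the assembly.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative sketch (not the committed asm): branch-minimized memset for
 * n <= 126. Each stage writes a pair of stores that grow the filled region
 * inward from both ends; successive stages overlap, so no loop is needed. */
static void *small_memset(void *dest, int c, size_t n)
{
	unsigned char *s = dest;
	uint64_t w = 0x0101010101010101ull * (unsigned char)c; /* replicate byte */

	if (!n) return dest;
	s[0] = s[n-1] = (unsigned char)c;              /* covers n <= 2 */
	if (n <= 2) return dest;
	memcpy(s+1, &w, 2); memcpy(s+n-3, &w, 2);      /* covers n <= 6 */
	if (n <= 6) return dest;
	memcpy(s+3, &w, 4); memcpy(s+n-7, &w, 4);      /* covers n <= 14 */
	if (n <= 14) return dest;
	memcpy(s+7, &w, 8); memcpy(s+n-15, &w, 8);     /* covers n <= 30 */
	if (n <= 30) return dest;
	memcpy(s+15, &w, 8); memcpy(s+23, &w, 8);
	memcpy(s+n-31, &w, 8); memcpy(s+n-23, &w, 8);  /* covers n <= 62 */
	if (n <= 62) return dest;
	memcpy(s+31, &w, 8); memcpy(s+39, &w, 8);
	memcpy(s+47, &w, 8); memcpy(s+55, &w, 8);
	memcpy(s+n-63, &w, 8); memcpy(s+n-55, &w, 8);
	memcpy(s+n-47, &w, 8); memcpy(s+n-39, &w, 8);  /* covers n <= 126 */
	return dest;
}
```

Every size up to 126 is thus handled with a short chain of compares and overlapping stores, and no loop.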
Diffstat (limited to 'src')
-rw-r--r-- | src/string/x86_64/memset.s | 81 |
1 file changed, 55 insertions, 26 deletions
diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
index 3cc8fcf6..2d3f5e52 100644
--- a/src/string/x86_64/memset.s
+++ b/src/string/x86_64/memset.s
@@ -1,43 +1,72 @@
 .global memset
 .type memset,@function
 memset:
-	movzbl %sil,%esi
-	mov $0x101010101010101,%rax
-	# 64-bit imul has 3-7 cycles latency, launch early
-	imul %rsi,%rax
+	movzbq %sil,%rax
+	mov $0x101010101010101,%r8
+	imul %r8,%rax
 
-	cmp $16,%rdx
-	jb 1f
+	cmp $126,%rdx
+	ja 2f
 
-	lea -1(%rdx),%rcx
-	mov %rdi,%r8
-	shr $3,%rcx
-	mov %rax,-8(%rdi,%rdx)
-	rep
-	stosq
-	mov %r8,%rax
-	ret
-
-1:	test %edx,%edx
+	test %edx,%edx
 	jz 1f
 
-	mov %al,(%rdi)
-	mov %al,-1(%rdi,%rdx)
+	mov %sil,(%rdi)
+	mov %sil,-1(%rdi,%rdx)
 	cmp $2,%edx
 	jbe 1f
 
-	mov %al,1(%rdi)
-	mov %al,-2(%rdi,%rdx)
-	cmp $4,%edx
+	mov %ax,1(%rdi)
+	mov %ax,(-1-2)(%rdi,%rdx)
+	cmp $6,%edx
+	jbe 1f
+
+	mov %eax,(1+2)(%rdi)
+	mov %eax,(-1-2-4)(%rdi,%rdx)
+	cmp $14,%edx
+	jbe 1f
+
+	mov %rax,(1+2+4)(%rdi)
+	mov %rax,(-1-2-4-8)(%rdi,%rdx)
+	cmp $30,%edx
 	jbe 1f
 
-	mov %eax,(%rdi)
-	mov %eax,-4(%rdi,%rdx)
-	cmp $8,%edx
+	mov %rax,(1+2+4+8)(%rdi)
+	mov %rax,(1+2+4+8+8)(%rdi)
+	mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
+	cmp $62,%edx
 	jbe 1f
 
-	mov %eax,4(%rdi)
-	mov %eax,-8(%rdi,%rdx)
+	mov %rax,(1+2+4+8+16)(%rdi)
+	mov %rax,(1+2+4+8+16+8)(%rdi)
+	mov %rax,(1+2+4+8+16+16)(%rdi)
+	mov %rax,(1+2+4+8+16+24)(%rdi)
+	mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)
 
 1:	mov %rdi,%rax
 	ret
+
+2:	test $15,%edi
+	mov %rdi,%r8
+	mov %rax,-8(%rdi,%rdx)
+	mov %rdx,%rcx
+	jnz 2f
+
+1:	shr $3,%rcx
+	rep
+	stosq
+	mov %r8,%rax
+	ret
+
+2:	xor %edx,%edx
+	sub %edi,%edx
+	and $15,%edx
+	mov %rax,(%rdi)
+	mov %rax,8(%rdi)
+	sub %rdx,%rcx
+	add %rdx,%rdi
+	jmp 1b
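The path taken for sizes above 126 (the code at the 2: labels) can be sketched the same way. Again this is a hedged approximation, not the committed asm: big_memset is a hypothetical name, and a plain 8-byte store loop stands in for "rep stosq".

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Rough C rendering of the large-size path (n > 126). The tail word is
 * written up front, the destination is bumped to the next 16-byte boundary
 * with two unconditional 8-byte stores, and the rest is filled 8 bytes at a
 * time (standing in for "rep stosq"). */
static void *big_memset(void *dest, int c, size_t n)
{
	unsigned char *s = dest;
	uint64_t w = 0x0101010101010101ull * (unsigned char)c;
	size_t adj = -(uintptr_t)s & 15;   /* bytes to the next 16-byte boundary */

	memcpy(s + n - 8, &w, 8);          /* tail covered up front, so rounding
	                                      the count down to 8 below is safe */
	if (adj) {
		memcpy(s, &w, 8);          /* unaligned head: 16 bytes always */
		memcpy(s + 8, &w, 8);      /* suffice since adj < 16 */
		s += adj;
		n -= adj;
	}
	for (size_t i = 0; i + 8 <= n; i += 8)
		memcpy(s + i, &w, 8);      /* aligned bulk fill */
	return dest;
}
```

Because the adjustment is computed as (-dest) & 15, an already 16-byte-aligned destination is not moved at all, which is how any additional alignment the caller happens to provide (mod 32, mod 64, ...) is preserved for the bulk fill.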