diff options
-rw-r--r-- | src/string/x86_64/memset.s | 30 |
1 file changed, 16 insertions, 14 deletions
diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
index fc06eef8..263336b5 100644
--- a/src/string/x86_64/memset.s
+++ b/src/string/x86_64/memset.s
@@ -1,41 +1,43 @@
 .global memset
 .type memset,@function
 memset:
-	and $0xff,%esi
+	movzbl %sil,%esi
 	mov $0x101010101010101,%rax
-	mov %rdx,%rcx
-	mov %rdi,%r8
+	# 64-bit imul has 3-7 cycles latency, launch early
 	imul %rsi,%rax
-	cmp $16,%rcx
+
+	cmp $16,%rdx
 	jb 1f
 
-	mov %rax,-8(%rdi,%rcx)
+	mov %rdx,%rcx
+	mov %rdi,%r8
 	shr $3,%rcx
+	mov %rax,-8(%rdi,%rdx)
 	rep
 	stosq
 	mov %r8,%rax
 	ret
 
-1:	test %ecx,%ecx
+1:	test %edx,%edx
 	jz 1f
 
 	mov %al,(%rdi)
-	mov %al,-1(%rdi,%rcx)
-	cmp $2,%ecx
+	mov %al,-1(%rdi,%rdx)
+	cmp $2,%edx
 	jbe 1f
 
 	mov %al,1(%rdi)
-	mov %al,-2(%rdi,%rcx)
-	cmp $4,%ecx
+	mov %al,-2(%rdi,%rdx)
+	cmp $4,%edx
 	jbe 1f
 
 	mov %eax,(%rdi)
-	mov %eax,-4(%rdi,%rcx)
-	cmp $8,%ecx
+	mov %eax,-4(%rdi,%rdx)
+	cmp $8,%edx
 	jbe 1f
 
 	mov %eax,4(%rdi)
-	mov %eax,-8(%rdi,%rcx)
+	mov %eax,-8(%rdi,%rdx)
 
-1:	mov %r8,%rax
+1:	mov %rdi,%rax
 	ret