mirror of https://git.musl-libc.org/git/musl
Browse Source
"and $0xff,%esi" is a six-byte insn (81 e6 ff 00 00 00), can use 4-byte "movzbl %sil,%esi" (40 0f b6 f6) instead. 64-bit imul is slow, move it as far up as possible so that the result (rax) has more time to be ready by the time we start using it in mem stores. There is no need to shuffle registers in preparation to "rep movs" if we are not going to take that code path. Thus, patch moves "jump if len < 16" instructions up, and changes alternate code path to use rdx and rdi instead of rcx and r8. Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>master
committed by
Rich Felker
1 changed files with 16 additions and 14 deletions
@@ -1,41 +1,43 @@
|||
# ------------------------------------------------------------------------
# NOTE(review): this is a scraped side-by-side diff of an x86_64 memset
# (AT&T/GAS syntax), not a standalone source file.  Where two variants of
# an instruction appear back to back, the FIRST is the old line and the
# SECOND is its replacement; the stray "|" / "|||" lines are table
# artifacts of the web rendering and not part of the code.
# SysV AMD64 ABI: rdi = dest, esi = fill byte, rdx = len; returns dest.
# ------------------------------------------------------------------------
.global memset |
|||
.type memset,@function |
|||
memset: |
|||
# old: zero-extends the fill byte with a 6-byte "and" (81 e6 ff 00 00 00)
and $0xff,%esi |
|||
# new: same zero-extension with the 4-byte movzbl encoding (40 0f b6 f6)
movzbl %sil,%esi |
|||
# rax = 0x0101010101010101; multiplying by the byte replicates it 8 times
mov $0x101010101010101,%rax |
|||
# old: rcx/r8 were set up for "rep stosq" before the size check, i.e.
# even when the short (<16 byte) path would never use them
mov %rdx,%rcx |
|||
mov %rdi,%r8 |
|||
# 64-bit imul has 3-7 cycles latency, launch early |
|||
imul %rsi,%rax |
|||
# old: length compared in rcx copy; new: length stays in rdx
cmp $16,%rcx |
|||
|
|||
cmp $16,%rdx |
|||
# lengths below 16 are handled byte/dword-wise at label 1 below
jb 1f |
|||
|
|||
# old: tail store indexed by rcx (relocated below in the new layout)
mov %rax,-8(%rdi,%rcx) |
|||
# new: rcx/r8 setup moved here, executed only on the rep-stosq path
mov %rdx,%rcx |
|||
mov %rdi,%r8 |
|||
# rcx = len / 8 = number of qwords for rep stosq
shr $3,%rcx |
|||
# store the last 8 bytes up front so the len%8 tail is covered
mov %rax,-8(%rdi,%rdx) |
|||
rep |
|||
stosq |
|||
# r8 preserved the original dest (rdi was advanced by stosq); return it
mov %r8,%rax |
|||
ret |
|||
|
|||
# short path (<16 bytes): nothing to store when len == 0
# old tested the ecx copy of len, new tests edx
1: test %ecx,%ecx |
|||
1: test %edx,%edx |
|||
jz 1f |
|||
|
|||
# 1..15 bytes: overlapping stores from both ends, byte stores first,
# then dword stores; each old rcx-indexed store is paired with its new
# rdx-indexed replacement, likewise each cmp on ecx with one on edx
mov %al,(%rdi) |
|||
mov %al,-1(%rdi,%rcx) |
|||
cmp $2,%ecx |
|||
mov %al,-1(%rdi,%rdx) |
|||
cmp $2,%edx |
|||
jbe 1f |
|||
|
|||
mov %al,1(%rdi) |
|||
mov %al,-2(%rdi,%rcx) |
|||
cmp $4,%ecx |
|||
mov %al,-2(%rdi,%rdx) |
|||
cmp $4,%edx |
|||
jbe 1f |
|||
|
|||
mov %eax,(%rdi) |
|||
mov %eax,-4(%rdi,%rcx) |
|||
cmp $8,%ecx |
|||
mov %eax,-4(%rdi,%rdx) |
|||
cmp $8,%edx |
|||
jbe 1f |
|||
|
|||
mov %eax,4(%rdi) |
|||
mov %eax,-8(%rdi,%rcx) |
|||
mov %eax,-8(%rdi,%rdx) |
|||
|
|||
# old: return dest from r8; new: rdi is never modified on this short
# path (r8 is no longer written before the branch), so return it directly
1: mov %r8,%rax |
|||
1: mov %rdi,%rax |
|||
ret |
|||
|
|||
Loading…
Reference in new issue