12 files changed, 430 insertions, 26 deletions
diff --git a/src/string/aarch64/memcpy.S b/src/string/aarch64/memcpy.S
new file mode 100644
index 00000000..48bb8a8d
--- /dev/null
+++ b/src/string/aarch64/memcpy.S
@@ -0,0 +1,186 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin   x0
+#define src     x1
+#define count   x2
+#define dst     x3
+#define srcend  x4
+#define dstend  x5
+#define A_l     x6
+#define A_lw    w6
+#define A_h     x7
+#define B_l     x8
+#define B_lw    w8
+#define B_h     x9
+#define C_l     x10
+#define C_lw    w10
+#define C_h     x11
+#define D_l     x12
+#define D_h     x13
+#define E_l     x14
+#define E_h     x15
+#define F_l     x16
+#define F_h     x17
+#define G_l     count
+#define G_h     dst
+#define H_l     src
+#define H_h     srcend
+#define tmp1    x14
+
+/* This implementation of memcpy uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+.global memcpy
+.type memcpy,%function
+memcpy:                                /* In: x0 = dstin, x1 = src, x2 = count.  x0 is never written below.  */
+	add     srcend, src, count      /* One past the last source byte.  */
+	add     dstend, dstin, count    /* One past the last destination byte.  */
+	cmp     count, 128
+	b.hi    .Lcopy_long
+	cmp     count, 32
+	b.hi    .Lcopy32_128
+
+	/* Small copies: 0..32 bytes.  */
+	cmp     count, 16
+	b.lo    .Lcopy16
+	ldp     A_l, A_h, [src]         /* 16..32 bytes: first 16 bytes ...  */
+	ldp     D_l, D_h, [srcend, -16] /* ... and last 16 bytes; the two ranges may overlap.  */
+	stp     A_l, A_h, [dstin]
+	stp     D_l, D_h, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes.  */
+.Lcopy16:
+	tbz     count, 3, .Lcopy8       /* Bit 3 clear here means count < 8.  */
+	ldr     A_l, [src]
+	ldr     A_h, [srcend, -8]       /* Last 8 bytes; may overlap the first 8.  */
+	str     A_l, [dstin]
+	str     A_h, [dstend, -8]
+	ret
+
+	.p2align 3
+	/* Copy 4-7 bytes.  */
+.Lcopy8:
+	tbz     count, 2, .Lcopy4       /* Bit 2 clear here means count < 4.  */
+	ldr     A_lw, [src]
+	ldr     B_lw, [srcend, -4]      /* Last 4 bytes; may overlap the first 4.  */
+	str     A_lw, [dstin]
+	str     B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence.  */
+.Lcopy4:
+	cbz     count, .Lcopy0
+	lsr     tmp1, count, 1          /* tmp1 = count / 2: 0 for count 1, 1 for count 2 or 3.  */
+	ldrb    A_lw, [src]             /* First byte.  */
+	ldrb    C_lw, [srcend, -1]      /* Last byte.  */
+	ldrb    B_lw, [src, tmp1]       /* Middle byte; coincides with first or last when count < 3.  */
+	strb    A_lw, [dstin]
+	strb    B_lw, [dstin, tmp1]
+	strb    C_lw, [dstend, -1]
+.Lcopy0:
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+.Lcopy32_128:
+	ldp     A_l, A_h, [src]         /* A, B = first 32 bytes.  */
+	ldp     B_l, B_h, [src, 16]
+	ldp     C_l, C_h, [srcend, -32] /* C, D = last 32 bytes.  */
+	ldp     D_l, D_h, [srcend, -16]
+	cmp     count, 64
+	b.hi    .Lcopy128
+	stp     A_l, A_h, [dstin]
+	stp     B_l, B_h, [dstin, 16]
+	stp     C_l, C_h, [dstend, -32]
+	stp     D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes.  */
+.Lcopy128:
+	ldp     E_l, E_h, [src, 32]     /* E, F = source bytes 32..63.  */
+	ldp     F_l, F_h, [src, 48]
+	cmp     count, 96
+	b.ls    .Lcopy96
+	ldp     G_l, G_h, [srcend, -64] /* G_* and H_* alias count, dst, src and srcend;  */
+	ldp     H_l, H_h, [srcend, -48] /* their original values are not needed after this load.  */
+	stp     G_l, G_h, [dstend, -64]
+	stp     H_l, H_h, [dstend, -48]
+.Lcopy96:
+	stp     A_l, A_h, [dstin]
+	stp     B_l, B_h, [dstin, 16]
+	stp     E_l, E_h, [dstin, 32]
+	stp     F_l, F_h, [dstin, 48]
+	stp     C_l, C_h, [dstend, -32]
+	stp     D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy more than 128 bytes.  */
+.Lcopy_long:
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+	ldp     D_l, D_h, [src]         /* First 16 source bytes.  */
+	and     tmp1, dstin, 15         /* tmp1 = offset of dstin within its 16-byte block.  */
+	bic     dst, dstin, 15          /* dst = dstin rounded down to a multiple of 16.  */
+	sub     src, src, tmp1          /* Move src back by the same offset.  */
+	add     count, count, tmp1      /* Count is now 16 too large.  */
+	ldp     A_l, A_h, [src, 16]
+	stp     D_l, D_h, [dstin]       /* Store the first 16 bytes at the original dstin.  */
+	ldp     B_l, B_h, [src, 32]
+	ldp     C_l, C_h, [src, 48]
+	ldp     D_l, D_h, [src, 64]!    /* Pre-index with writeback: src += 64.  */
+	subs    count, count, 128 + 16  /* Test and readjust count.  */
+	b.ls    .Lcopy64_from_end
+
+.Lloop64:                              /* Store the 64 bytes already in A..D while loading the next 64.  */
+	stp     A_l, A_h, [dst, 16]
+	ldp     A_l, A_h, [src, 16]
+	stp     B_l, B_h, [dst, 32]
+	ldp     B_l, B_h, [src, 32]
+	stp     C_l, C_h, [dst, 48]
+	ldp     C_l, C_h, [src, 48]
+	stp     D_l, D_h, [dst, 64]!    /* Writeback: dst += 64.  */
+	ldp     D_l, D_h, [src, 64]!    /* Writeback: src += 64.  */
+	subs    count, count, 64
+	b.hi    .Lloop64
+
+	/* Write the last iteration and copy 64 bytes from the end.  */
+.Lcopy64_from_end:
+	ldp     E_l, E_h, [srcend, -64]
+	stp     A_l, A_h, [dst, 16]     /* A..D still hold the last block loaded above.  */
+	ldp     A_l, A_h, [srcend, -48]
+	stp     B_l, B_h, [dst, 32]
+	ldp     B_l, B_h, [srcend, -32]
+	stp     C_l, C_h, [dst, 48]
+	ldp     C_l, C_h, [srcend, -16]
+	stp     D_l, D_h, [dst, 64]
+	stp     E_l, E_h, [dstend, -64] /* Final 64 bytes, addressed from the end of the buffers.  */
+	stp     A_l, A_h, [dstend, -48]
+	stp     B_l, B_h, [dstend, -32]
+	stp     C_l, C_h, [dstend, -16]
+	ret
+
+.size memcpy,.-memcpy
diff --git a/src/string/aarch64/memset.S b/src/string/aarch64/memset.S
new file mode 100644
index 00000000..f0d29b7f
--- /dev/null
+++ b/src/string/aarch64/memset.S
@@ -0,0 +1,115 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#define dstin   x0
+#define val     x1
+#define valw    w1
+#define count   x2
+#define dst     x3
+#define dstend  x4
+#define zva_val x5
+
+.global memset
+.type memset,%function
+memset:                                /* In: x0 = dstin, w1 = fill value, x2 = count.  x0 is never written below.  */
+
+	dup     v0.16B, valw            /* Replicate the low byte of valw into all 16 bytes of v0.  */
+	add     dstend, dstin, count    /* One past the last byte to set.  */
+
+	cmp     count, 96
+	b.hi    .Lset_long
+	cmp     count, 16
+	b.hs    .Lset_medium
+	mov     val, v0.D[0]            /* val = fill byte repeated 8 times.  */
+
+	/* Set 0..15 bytes.  */
+	tbz     count, 3, 1f            /* Bit 3 clear here means count < 8.  */
+	str     val, [dstin]
+	str     val, [dstend, -8]       /* May overlap the previous store.  */
+	ret
+	nop
+1:      tbz     count, 2, 2f            /* Bit 2 clear here means count < 4.  */
+	str     valw, [dstin]
+	str     valw, [dstend, -4]      /* May overlap the previous store.  */
+	ret
+2:      cbz     count, 3f
+	strb    valw, [dstin]
+	tbz     count, 1, 3f            /* count == 1: the single byte is already written.  */
+	strh    valw, [dstend, -2]      /* count is 2 or 3: write the last two bytes.  */
+3:      ret
+
+	/* Set 16..96 bytes.  */
+.Lset_medium:
+	str     q0, [dstin]
+	tbnz    count, 6, .Lset96       /* Bit 6 set here means count is 64..96.  */
+	str     q0, [dstend, -16]
+	tbz     count, 5, 1f            /* Bit 5 clear: count is 16..31, both stores done.  */
+	str     q0, [dstin, 16]
+	str     q0, [dstend, -32]
+1:      ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+.Lset96:
+	str     q0, [dstin, 16]
+	stp     q0, q0, [dstin, 32]
+	stp     q0, q0, [dstend, -32]
+	ret
+
+	.p2align 4
+.Lset_long:
+	and     valw, valw, 255         /* Keep only the fill byte for the zero test below.  */
+	bic     dst, dstin, 15          /* dst = dstin rounded down to a multiple of 16.  */
+	str     q0, [dstin]
+	cmp     count, 160
+	ccmp    valw, 0, 0, hs          /* If count >= 160 compare valw with 0, else force "ne".  */
+	b.ne    .Lno_zva                /* DC ZVA is only used for a zero fill of at least 160 bytes.  */
+
+#ifndef SKIP_ZVA_CHECK
+	mrs     zva_val, dczid_el0
+	and     zva_val, zva_val, 31    /* Keep bits 4:0 (block size field and the prohibit bit).  */
+	cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
+	b.ne    .Lno_zva
+#endif
+	str     q0, [dst, 16]           /* Plain stores cover everything below dst + 64.  */
+	stp     q0, q0, [dst, 32]
+	bic     dst, dst, 63            /* Round dst down to a multiple of 64.  */
+	sub     count, dstend, dst      /* Count is now 64 too large.  */
+	sub     count, count, 128       /* Adjust count and bias for loop.  */
+
+	.p2align 4
+.Lzva_loop:
+	add     dst, dst, 64
+	dc      zva, dst                /* Zero the 64-byte block at dst.  */
+	subs    count, count, 64
+	b.hi    .Lzva_loop
+	stp     q0, q0, [dstend, -64]   /* Final 64 bytes, addressed from the end.  */
+	stp     q0, q0, [dstend, -32]
+	ret
+
+.Lno_zva:
+	sub     count, dstend, dst      /* Count is 16 too large.  */
+	sub     dst, dst, 16            /* Dst is biased by -32.  */
+	sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
+.Lno_zva_loop:
+	stp     q0, q0, [dst, 32]
+	stp     q0, q0, [dst, 64]!      /* Writeback: dst += 64.  */
+	subs    count, count, 64
+	b.hi    .Lno_zva_loop
+	stp     q0, q0, [dstend, -64]   /* Final 64 bytes, addressed from the end.  */
+	stp     q0, q0, [dstend, -32]
+	ret
+
+.size memset,.-memset
+
diff --git a/src/string/arm/memcpy_le.S b/src/string/arm/memcpy.S
index 9cfbcb2a..869e3448 100644
--- a/src/string/arm/memcpy_le.S
+++ b/src/string/arm/memcpy.S
@@ -1,5 +1,3 @@
-#if !__ARMEB__ && !__thumb__
-
 /*
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
@@ -40,8 +38,9 @@
  * This file has been modified from the original for use in musl libc.
  * The main changes are: addition of .type memcpy,%function to make the
  * code safely callable from thumb mode, adjusting the return
- * instructions to be compatible with pre-thumb ARM cpus, and removal
- * of prefetch code that is not compatible with older cpus.
+ * instructions to be compatible with pre-thumb ARM cpus, removal of
+ * prefetch code that is not compatible with older cpus and support for
+ * building as thumb 2 and big-endian.
  */
 
 .syntax unified
@@ -226,23 +225,45 @@ non_congruent:
 	 * becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
 	 */
 	movs    r5, r5, lsl #31
+
+#if __ARMEB__
+	movmi   r3, r3, ror #24
+	strbmi	r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs	r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs	r3, [r0], #1
+#else
 	strbmi r3, [r0], #1
 	movmi   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
+#endif
 
 	cmp     r2, #4
 	blo     partial_word_tail
 
+#if __ARMEB__
+	mov	r3, r3, lsr r12
+	mov	r3, r3, lsl r12
+#endif
+
 	/* Align destination to 32 bytes (cache line boundary) */
 1:      tst     r0, #0x1c
 	beq     2f
 	ldr     r5, [r1], #4
 	sub     r2, r2, #4
-	orr     r4, r3, r5,             lsl lr
+#if __ARMEB__
+	mov     r4, r5,                 lsr lr
+	orr     r4, r4, r3
+	mov     r3, r5,                 lsl r12
+#else
+	mov     r4, r5,                 lsl lr
+	orr     r4, r4, r3
 	mov     r3, r5,                 lsr r12
+#endif
 	str     r4, [r0], #4
 	cmp     r2, #4
 	bhs     1b
@@ -268,6 +289,25 @@ loop16:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #16
+	mov     r4, r4, lsl #16
+	orr     r4, r4, r5, lsr #16
+	mov     r5, r5, lsl #16
+	orr     r5, r5, r6, lsr #16
+	mov     r6, r6, lsl #16
+	orr     r6, r6, r7, lsr #16
+	mov     r7, r7, lsl #16
+	orr     r7, r7, r8, lsr #16
+	mov     r8, r8, lsl #16
+	orr     r8, r8, r9, lsr #16
+	mov     r9, r9, lsl #16
+	orr     r9, r9, r10, lsr #16
+	mov     r10, r10,               lsl #16
+	orr     r10, r10, r11, lsr #16
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #16
+#else
 	orr     r3, r3, r4, lsl #16
 	mov     r4, r4, lsr #16
 	orr     r4, r4, r5, lsl #16
@@ -285,6 +325,7 @@ loop16:
 	orr     r10, r10, r11, lsl #16
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #16
+#endif
 	bhs     1b
 	b       less_than_thirtytwo
 
@@ -294,6 +335,25 @@ loop8:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #24
+	mov     r4, r4, lsl #8
+	orr     r4, r4, r5, lsr #24
+	mov     r5, r5, lsl #8
+	orr     r5, r5, r6, lsr #24
+	mov     r6, r6,  lsl #8
+	orr     r6, r6, r7, lsr #24
+	mov     r7, r7,  lsl #8
+	orr     r7, r7, r8,             lsr #24
+	mov     r8, r8,  lsl #8
+	orr     r8, r8, r9,             lsr #24
+	mov     r9, r9,  lsl #8
+	orr     r9, r9, r10,    lsr #24
+	mov     r10, r10, lsl #8
+	orr     r10, r10, r11,  lsr #24
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #8
+#else
 	orr     r3, r3, r4, lsl #24
 	mov     r4, r4, lsr #8
 	orr     r4, r4, r5, lsl #24
@@ -311,6 +371,7 @@ loop8:
 	orr     r10, r10, r11,  lsl #24
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #8
+#endif
 	bhs     1b
 	b       less_than_thirtytwo
 
@@ -320,6 +381,25 @@ loop24:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #8
+	mov     r4, r4, lsl #24
+	orr     r4, r4, r5, lsr #8
+	mov     r5, r5, lsl #24
+	orr     r5, r5, r6, lsr #8
+	mov     r6, r6, lsl #24
+	orr     r6, r6, r7, lsr #8
+	mov     r7, r7, lsl #24
+	orr     r7, r7, r8, lsr #8
+	mov     r8, r8, lsl #24
+	orr     r8, r8, r9, lsr #8
+	mov     r9, r9, lsl #24
+	orr     r9, r9, r10, lsr #8
+	mov     r10, r10, lsl #24
+	orr     r10, r10, r11, lsr #8
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #24
+#else
 	orr     r3, r3, r4, lsl #8
 	mov     r4, r4, lsr #24
 	orr     r4, r4, r5, lsl #8
@@ -337,6 +417,7 @@ loop24:
 	orr     r10, r10, r11, lsl #8
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #24
+#endif
 	bhs     1b
 
 less_than_thirtytwo:
@@ -348,8 +429,15 @@ less_than_thirtytwo:
 
 1:      ldr     r5, [r1], #4
 	sub     r2, r2, #4
-	orr     r4, r3, r5,             lsl lr
+#if __ARMEB__
+	mov     r4, r5,                 lsr lr
+	orr     r4, r4, r3
+	mov     r3,     r5,                     lsl r12
+#else
+	mov     r4, r5,                 lsl lr
+	orr     r4, r4, r3
 	mov     r3,     r5,                     lsr r12
+#endif
 	str     r4, [r0], #4
 	cmp     r2, #4
 	bhs     1b
@@ -357,11 +445,20 @@ less_than_thirtytwo:
 partial_word_tail:
 	/* we have a partial word in the input buffer */
 	movs    r5, lr, lsl #(31-3)
+#if __ARMEB__
+	movmi   r3, r3, ror #24
+	strbmi r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs r3, [r0], #1
+#else
 	strbmi r3, [r0], #1
 	movmi   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
 	strbcs r3, [r0], #1
+#endif
 
 	/* Refill spilled registers from the stack. Don't update sp. */
 	ldmfd   sp, {r5-r11}
@@ -380,4 +477,3 @@ copy_last_3_and_return:
 	ldmfd   sp!, {r0, r4, lr}
 	bx      lr
 
-#endif
diff --git a/src/string/arm/memcpy.c b/src/string/arm/memcpy.c
deleted file mode 100644
index f703c9bd..00000000
--- a/src/string/arm/memcpy.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#if __ARMEB__ || __thumb__
-#include "../memcpy.c"
-#endif
diff --git a/src/string/memccpy.c b/src/string/memccpy.c
index 00c18e2b..3b0a3700 100644
--- a/src/string/memccpy.c
+++ b/src/string/memccpy.c
@@ -29,6 +29,6 @@ void *memccpy(void *restrict dest, const void *restrict src, int c, size_t n)
 #endif
 	for (; n && (*d=*s)!=c; n--, s++, d++);
 tail:
-	if (n && *s==c) return d+1;
+	if (n) return d+1;
 	return 0;
 }
diff --git a/src/string/memmem.c b/src/string/memmem.c
index 58a21fcd..11eff86e 100644
--- a/src/string/memmem.c
+++ b/src/string/memmem.c
@@ -12,8 +12,8 @@ static char *twobyte_memmem(const unsigned char *h, size_t k, const unsigned cha
 
 static char *threebyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
 {
-	uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8;
-	uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8;
+	uint32_t nw = (uint32_t)n[0]<<24 | n[1]<<16 | n[2]<<8;
+	uint32_t hw = (uint32_t)h[0]<<24 | h[1]<<16 | h[2]<<8;
 	for (h+=3, k-=3; k; k--, hw = (hw|*h++)<<8)
 		if (hw == nw) return (char *)h-3;
 	return hw == nw ? (char *)h-3 : 0;
@@ -21,8 +21,8 @@ static char *threebyte_memmem(const unsigned char *h, size_t k, const unsigned c
 
 static char *fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
 {
-	uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3];
-	uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];
+	uint32_t nw = (uint32_t)n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3];
+	uint32_t hw = (uint32_t)h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];
 	for (h+=4, k-=4; k; k--, hw = hw<<8 | *h++)
 		if (hw == nw) return (char *)h-4;
 	return hw == nw ? (char *)h-4 : 0;
diff --git a/src/string/strsignal.c b/src/string/strsignal.c
index 96bfe841..5156366e 100644
--- a/src/string/strsignal.c
+++ b/src/string/strsignal.c
@@ -31,7 +31,11 @@ static const char map[] = {
 	[SIGPIPE]   = 13,
 	[SIGALRM]   = 14,
 	[SIGTERM]   = 15,
+#if defined(SIGSTKFLT)
 	[SIGSTKFLT] = 16,
+#elif defined(SIGEMT)
+	[SIGEMT]    = 16,
+#endif
 	[SIGCHLD]   = 17,
 	[SIGCONT]   = 18,
 	[SIGSTOP]   = 19,
@@ -70,7 +74,13 @@ static const char strings[] =
 	"Broken pipe\0"
 	"Alarm clock\0"
 	"Terminated\0"
+#if defined(SIGSTKFLT)
 	"Stack fault\0"
+#elif defined(SIGEMT)
+	"Emulator trap\0"
+#else
+	"Unknown signal\0"
+#endif
 	"Child process status\0"
 	"Continued\0"
 	"Stopped (signal)\0"
diff --git a/src/string/strstr.c b/src/string/strstr.c
index 55ba1c7b..96657bc2 100644
--- a/src/string/strstr.c
+++ b/src/string/strstr.c
@@ -10,16 +10,16 @@ static char *twobyte_strstr(const unsigned char *h, const unsigned char *n)
 
 static char *threebyte_strstr(const unsigned char *h, const unsigned char *n)
 {
-	uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8;
-	uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8;
+	uint32_t nw = (uint32_t)n[0]<<24 | n[1]<<16 | n[2]<<8; /* cast binds first: the <<24 is done in uint32_t, not int */
+	uint32_t hw = (uint32_t)h[0]<<24 | h[1]<<16 | h[2]<<8;
 	for (h+=2; *h && hw != nw; hw = (hw|*++h)<<8);
 	return *h ? (char *)h-2 : 0;
 }
 
 static char *fourbyte_strstr(const unsigned char *h, const unsigned char *n)
 {
-	uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3];
-	uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];
+	uint32_t nw = (uint32_t)n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3]; /* cast binds first: the <<24 is done in uint32_t, not int */
+	uint32_t hw = (uint32_t)h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];
 	for (h+=3; *h && hw != nw; hw = hw<<8 | *++h);
 	return *h ? (char *)h-3 : 0;
 }
@@ -96,7 +96,7 @@ static char *twoway_strstr(const unsigned char *h, const unsigned char *n)
 	for (;;) {
 		/* Update incremental end-of-haystack pointer */
 		if (z-h < l) {
-			/* Fast estimate for MIN(l,63) */
+			/* Fast estimate for MAX(l,63) */
 			size_t grow = l | 63;
 			const unsigned char *z2 = memchr(z, 0, grow);
 			if (z2) {
diff --git a/src/string/strverscmp.c b/src/string/strverscmp.c
index 4daf276d..16c1da22 100644
--- a/src/string/strverscmp.c
+++ b/src/string/strverscmp.c
@@ -18,9 +18,9 @@ int strverscmp(const char *l0, const char *r0)
 		else if (c!='0') z=0;
 	}
 
-	if (l[dp]!='0' && r[dp]!='0') {
-		/* If we're not looking at a digit sequence that began
-		 * with a zero, longest digit string is greater. */
+	if (l[dp]-'1'<9U && r[dp]-'1'<9U) {
+		/* If we're looking at non-degenerate digit sequences starting
+		 * with nonzero digits, longest digit string is greater. */
 		for (j=i; isdigit(l[j]); j++)
 			if (!isdigit(r[j])) return 1;
 		if (isdigit(r[j])) return -1;
diff --git a/src/string/wcscmp.c b/src/string/wcscmp.c
index 26eeee70..286ec3ea 100644
--- a/src/string/wcscmp.c
+++ b/src/string/wcscmp.c
@@ -3,5 +3,5 @@
 int wcscmp(const wchar_t *l, const wchar_t *r)
 {
 	for (; *l==*r && *l && *r; l++, r++);
-	return *l - *r;
+	return *l < *r ? -1 : *l > *r; /* -1, 0 or 1 by comparison; no wchar_t subtraction */
 }
diff --git a/src/string/wcsncmp.c b/src/string/wcsncmp.c
index 4ab32a92..2b3558bf 100644
--- a/src/string/wcsncmp.c
+++ b/src/string/wcsncmp.c
@@ -3,5 +3,5 @@
 int wcsncmp(const wchar_t *l, const wchar_t *r, size_t n)
 {
 	for (; n && *l==*r && *l && *r; n--, l++, r++);
-	return n ? *l - *r : 0;
+	return n ? (*l < *r ? -1 : *l > *r) : 0; /* n == 0: all compared elements matched; else -1, 0 or 1 by comparison */
 }
diff --git a/src/string/wmemcmp.c b/src/string/wmemcmp.c
index 2a193263..717d77b1 100644
--- a/src/string/wmemcmp.c
+++ b/src/string/wmemcmp.c
@@ -3,5 +3,5 @@
 int wmemcmp(const wchar_t *l, const wchar_t *r, size_t n)
 {
 	for (; n && *l==*r; n--, l++, r++);
-	return n ? *l-*r : 0;
+	return n ? (*l < *r ? -1 : *l > *r) : 0; /* n == 0: all n elements matched; else -1 or 1 by comparison */
 }