diff options
| author | Rich Felker <dalias@aerifal.cx> | 2013-08-28 03:34:57 -0400 | 
|---|---|---|
| committer | Rich Felker <dalias@aerifal.cx> | 2013-08-28 03:34:57 -0400 | 
| commit | 90edf1cc15cec685c18ec2485ddce5b655963464 (patch) | |
| tree | 97b728abcd94fbb0e41a2a1d14a7a15d70b24c36 | |
| parent | 38e6acbf89afd3dfabb4f4d0506319c339b13663 (diff) | |
| download | musl-90edf1cc15cec685c18ec2485ddce5b655963464.tar.gz | |
optimized C memcpy
unlike the old C memcpy, this version handles word-at-a-time reads and
writes even for misaligned copies. it does not require that the cpu
support misaligned accesses; instead, it performs bit shifts to
realign the bytes for the destination.
essentially, this is the C version of the ARM assembly language
memcpy. the ideas are all the same, and it should perform well on any
arch with a decent number of general-purpose registers that has a
barrel shift operation. since the barrel shifter is an optional cpu
feature on microblaze, it may be desirable to provide an alternate asm
implementation on microblaze, but otherwise the C code provides a
competitive implementation for "generic risc-y" cpu archs that should
alleviate the urgent need for arch-specific memcpy asm.
| -rw-r--r-- | src/string/memcpy.c | 127 | 
1 file changed, 111 insertions, 16 deletions
```diff
diff --git a/src/string/memcpy.c b/src/string/memcpy.c
index 8e98302f..06e88742 100644
--- a/src/string/memcpy.c
+++ b/src/string/memcpy.c
@@ -1,29 +1,124 @@
 #include <string.h>
-#include <stdlib.h>
 #include <stdint.h>
-
-#define SS (sizeof(size_t))
-#define ALIGN (sizeof(size_t)-1)
-#define ONES ((size_t)-1/UCHAR_MAX)
+#include <endian.h>
 
 void *memcpy(void *restrict dest, const void *restrict src, size_t n)
 {
 	unsigned char *d = dest;
 	const unsigned char *s = src;
 
-	if (((uintptr_t)d & ALIGN) != ((uintptr_t)s & ALIGN))
-		goto misaligned;
+#ifdef __GNUC__
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define LS >>
+#define RS <<
+#else
+#define LS <<
+#define RS >>
+#endif
+
+	typedef uint32_t __attribute__((__may_alias__)) u32;
+	uint32_t w, x;
 
-	for (; ((uintptr_t)d & ALIGN) && n; n--) *d++ = *s++;
-	if (n) {
-		size_t *wd = (void *)d;
-		const size_t *ws = (const void *)s;
+	for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++;
 
-		for (; n>=SS; n-=SS) *wd++ = *ws++;
-		d = (void *)wd;
-		s = (const void *)ws;
-misaligned:
-		for (; n; n--) *d++ = *s++;
+	if ((uintptr_t)d % 4 == 0) {
+		for (; n>=16; s+=16, d+=16, n-=16) {
+			*(u32 *)(d+0) = *(u32 *)(s+0);
+			*(u32 *)(d+4) = *(u32 *)(s+4);
+			*(u32 *)(d+8) = *(u32 *)(s+8);
+			*(u32 *)(d+12) = *(u32 *)(s+12);
+		}
+		if (n&8) {
+			*(u32 *)(d+0) = *(u32 *)(s+0);
+			*(u32 *)(d+4) = *(u32 *)(s+4);
+			d += 8; s += 8;
+		}
+		if (n&4) {
+			*(u32 *)(d+0) = *(u32 *)(s+0);
+			d += 4; s += 4;
+		}
+		if (n&2) {
+			*d++ = *s++; *d++ = *s++;
+		}
+		if (n&1) {
+			*d = *s;
+		}
+		return dest;
+	}
+
+	if (n >= 32) switch ((uintptr_t)d % 4) {
+	case 1:
+		w = *(u32 *)s;
+		*d++ = *s++;
+		*d++ = *s++;
+		*d++ = *s++;
+		n -= 3;
+		for (; n>=17; s+=16, d+=16, n-=16) {
+			x = *(u32 *)(s+1);
+			*(u32 *)(d+0) = (w LS 24) | (x RS 8);
+			w = *(u32 *)(s+5);
+			*(u32 *)(d+4) = (x LS 24) | (w RS 8);
+			x = *(u32 *)(s+9);
+			*(u32 *)(d+8) = (w LS 24) | (x RS 8);
+			w = *(u32 *)(s+13);
+			*(u32 *)(d+12) = (x LS 24) | (w RS 8);
+		}
+		break;
+	case 2:
+		w = *(u32 *)s;
+		*d++ = *s++;
+		*d++ = *s++;
+		n -= 2;
+		for (; n>=18; s+=16, d+=16, n-=16) {
+			x = *(u32 *)(s+2);
+			*(u32 *)(d+0) = (w LS 16) | (x RS 16);
+			w = *(u32 *)(s+6);
+			*(u32 *)(d+4) = (x LS 16) | (w RS 16);
+			x = *(u32 *)(s+10);
+			*(u32 *)(d+8) = (w LS 16) | (x RS 16);
+			w = *(u32 *)(s+14);
+			*(u32 *)(d+12) = (x LS 16) | (w RS 16);
+		}
+		break;
+	case 3:
+		w = *(u32 *)s;
+		*d++ = *s++;
+		n -= 1;
+		for (; n>=19; s+=16, d+=16, n-=16) {
+			x = *(u32 *)(s+3);
+			*(u32 *)(d+0) = (w LS 8) | (x RS 24);
+			w = *(u32 *)(s+7);
+			*(u32 *)(d+4) = (x LS 8) | (w RS 24);
+			x = *(u32 *)(s+11);
+			*(u32 *)(d+8) = (w LS 8) | (x RS 24);
+			w = *(u32 *)(s+15);
+			*(u32 *)(d+12) = (x LS 8) | (w RS 24);
+		}
+		break;
+	}
+	if (n&16) {
+		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
+		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
+		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
+		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
 	}
+	if (n&8) {
+		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
+		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
+	}
+	if (n&4) {
+		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
+	}
+	if (n&2) {
+		*d++ = *s++; *d++ = *s++;
+	}
+	if (n&1) {
+		*d = *s;
+	}
+	return dest;
+#endif
+
+	for (; n; n--) *d++ = *s++;
 	return dest;
 }
```
