diff --git a/arch/openrisc/TODO.openrisc b/arch/openrisc/TODO.openrisc index 0eb04c8240f9..c43d4e1d14eb 100644 --- a/arch/openrisc/TODO.openrisc +++ b/arch/openrisc/TODO.openrisc @@ -10,4 +10,3 @@ that are due for investigation shortly, i.e. our TODO list: or1k and this change is slowly trickling through the stack. For the time being, or32 is equivalent to or1k. --- Implement optimized version of memcpy and memset diff --git a/arch/openrisc/include/asm/string.h b/arch/openrisc/include/asm/string.h index 33470d4d6948..64939ccd7531 100644 --- a/arch/openrisc/include/asm/string.h +++ b/arch/openrisc/include/asm/string.h @@ -4,4 +4,7 @@ #define __HAVE_ARCH_MEMSET extern void *memset(void *s, int c, __kernel_size_t n); +#define __HAVE_ARCH_MEMCPY +extern void *memcpy(void *dest, __const void *src, __kernel_size_t n); + #endif /* __ASM_OPENRISC_STRING_H */ diff --git a/arch/openrisc/lib/Makefile b/arch/openrisc/lib/Makefile index 67c583e0617f..17d9d37f32d2 100644 --- a/arch/openrisc/lib/Makefile +++ b/arch/openrisc/lib/Makefile @@ -2,4 +2,4 @@ # Makefile for or32 specific library files.. # -obj-y = memset.o string.o delay.o +obj-y := delay.o string.o memset.o memcpy.o diff --git a/arch/openrisc/lib/memcpy.c b/arch/openrisc/lib/memcpy.c new file mode 100644 index 000000000000..4706f01a199a --- /dev/null +++ b/arch/openrisc/lib/memcpy.c @@ -0,0 +1,124 @@ +/* + * arch/openrisc/lib/memcpy.c + * + * Optimized memory copy routines for openrisc. These are mostly copied + * from other sources but slightly extended based on ideas discussed in + * #openrisc. + * + * The word unroll implementation is an extension to the arm byte + * unrolled implementation, but using word copies (if things are + * properly aligned) + * + * The great arm loop unroll algorithm can be found at: + * arch/arm/boot/compressed/string.c + */ + +#include + +#include + +#ifdef CONFIG_OR1200 +/* + * Do memcpy with word copies and loop unrolling. 
This gives the + * best performance on the OR1200 and MOR1KX archirectures + */ +void *memcpy(void *dest, __const void *src, __kernel_size_t n) +{ + int i = 0; + unsigned char *d, *s; + uint32_t *dest_w = (uint32_t *)dest, *src_w = (uint32_t *)src; + + /* If both source and dest are word aligned copy words */ + if (!((unsigned int)dest_w & 3) && !((unsigned int)src_w & 3)) { + /* Copy 32 bytes per loop */ + for (i = n >> 5; i > 0; i--) { + *dest_w++ = *src_w++; + *dest_w++ = *src_w++; + *dest_w++ = *src_w++; + *dest_w++ = *src_w++; + *dest_w++ = *src_w++; + *dest_w++ = *src_w++; + *dest_w++ = *src_w++; + *dest_w++ = *src_w++; + } + + if (n & 1 << 4) { + *dest_w++ = *src_w++; + *dest_w++ = *src_w++; + *dest_w++ = *src_w++; + *dest_w++ = *src_w++; + } + + if (n & 1 << 3) { + *dest_w++ = *src_w++; + *dest_w++ = *src_w++; + } + + if (n & 1 << 2) + *dest_w++ = *src_w++; + + d = (unsigned char *)dest_w; + s = (unsigned char *)src_w; + + } else { + d = (unsigned char *)dest_w; + s = (unsigned char *)src_w; + + for (i = n >> 3; i > 0; i--) { + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + } + + if (n & 1 << 2) { + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + } + } + + if (n & 1 << 1) { + *d++ = *s++; + *d++ = *s++; + } + + if (n & 1) + *d++ = *s++; + + return dest; +} +#else +/* + * Use word copies but no loop unrolling as we cannot assume there + * will be benefits on the archirecture + */ +void *memcpy(void *dest, __const void *src, __kernel_size_t n) +{ + unsigned char *d = (unsigned char *)dest, *s = (unsigned char *)src; + uint32_t *dest_w = (uint32_t *)dest, *src_w = (uint32_t *)src; + + /* If both source and dest are word aligned copy words */ + if (!((unsigned int)dest_w & 3) && !((unsigned int)src_w & 3)) { + for (; n >= 4; n -= 4) + *dest_w++ = *src_w++; + } + + d = (unsigned char *)dest_w; + s = (unsigned char *)src_w; + + /* For remaining or if not aligned, copy bytes */ + 
for (; n >= 1; n -= 1) + *d++ = *s++; + + return dest; + +} +#endif + +EXPORT_SYMBOL(memcpy);