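/*
 * Source: linux/arch/powerpc/lib/checksum_32.S
 * (listing metadata: 298 lines, 6.1 KiB, tagged "ArmAsm" by the code
 * browser; the code below is PowerPC assembly)
 */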

/*
* This file contains assembly-language implementations
* of IP-style 1's complement checksum routines.
*
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
*/
#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
.text
/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 *
 * Register usage, as read from the code below:
 *   r3 = buff on entry (kept biased by -4 while running), result on exit
 *   r4 = len (remaining byte count)
 *   r5 = sum, the running 32-bit accumulator
 *   r6 = word / block counts, r0, r7, r8 = scratch data registers
 *
 * Every addition is done with adde, so the carry out of one addition is
 * fed into the next one; the last carry is folded in by addze at label 5.
 * Nothing between the first adde and the final addze may modify the
 * carry bit, which is why the order of the instructions matters here.
 */
_GLOBAL(__csum_partial)
/* Bias the pointer by -4 so that 4(r3) is the next unread byte and
 * lwzu r0,4(r3) both loads the next word and advances the pointer. */
subi r3,r3,4
srawi. r6,r4,2 /* Divide len by 4 and also clear carry */
beq 3f /* if we're doing < 4 bytes */
/* Bit 1 of the (biased) address set: consume one halfword first. */
andi. r0,r3,2 /* Align buffer to longword boundary */
beq+ 1f
lhz r0,4(r3) /* do 2 bytes to get aligned */
subi r4,r4,2
addi r3,r3,2
srwi. r6,r4,2 /* # words to do */
adde r5,r5,r0
beq 3f
/* Sum (word count & 3) single words so that the remaining word count
 * is a multiple of 4 for the unrolled loop at label 22. */
1: andi. r6,r6,3 /* Prepare to handle words 4 by 4 */
beq 21f
mtctr r6
2: lwzu r0,4(r3)
adde r5,r5,r0
bdnz 2b
21: srwi. r6,r4,4 /* # blocks of 4 words to do */
beq 3f
mtctr r6
/* Unrolled loop: four loads followed by four carry-chained additions,
 * the last load (lwzu) advancing r3 by 16 bytes. */
22: lwz r0,4(r3)
lwz r6,8(r3)
lwz r7,12(r3)
lwzu r8,16(r3)
adde r5,r5,r0
adde r5,r5,r6
adde r5,r5,r7
adde r5,r5,r8
bdnz 22b
/* Tail: bit 1 of len selects a trailing halfword ... */
3: andi. r0,r4,2
beq+ 4f
lhz r0,4(r3)
addi r3,r3,2
adde r5,r5,r0
/* ... and bit 0 of len selects a trailing single byte. */
4: andi. r0,r4,1
beq+ 5f
lbz r0,4(r3)
slwi r0,r0,8 /* Upper byte of word */
adde r5,r5,r0
5: addze r3,r5 /* add in final carry */
blr
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively (when that pointer is not
 * NULL) and returns immediately.  Note that the fault handlers in
 * this file do not zero the rest of dst after an error on src; if
 * that is required it has to be done outside this routine.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
/*
 * CSUM_COPY_16_BYTES_WITHEX(n): copy 16 bytes and checksum them.
 *
 * Loads four words from 4..16(r4) into r7-r10, stores them to
 * 4..16(r6) and adds each word into r12 with adde (carry chained).
 * The last load and the last store use the update forms (lwzu/stwu),
 * so both r4 and r6 end up advanced by 16.
 *
 * Each memory access gets its own numeric label built by token pasting:
 * 8<n>0..8<n>3 are the loads and 8<n>4..8<n>7 are the stores.  These
 * labels are what CSUM_COPY_16_BYTES_EXCODE(n) refers to, so the two
 * macros must stay in sync.
 */
#define CSUM_COPY_16_BYTES_WITHEX(n) \
8 ## n ## 0: \
lwz r7,4(r4); \
8 ## n ## 1: \
lwz r8,8(r4); \
8 ## n ## 2: \
lwz r9,12(r4); \
8 ## n ## 3: \
lwzu r10,16(r4); \
8 ## n ## 4: \
stw r7,4(r6); \
adde r12,r12,r7; \
8 ## n ## 5: \
stw r8,8(r6); \
adde r12,r12,r8; \
8 ## n ## 6: \
stw r9,12(r6); \
adde r12,r12,r9; \
8 ## n ## 7: \
stwu r10,16(r6); \
adde r12,r12,r10
/*
 * CSUM_COPY_16_BYTES_EXCODE(n): exception-table entries for one
 * CSUM_COPY_16_BYTES_WITHEX(n) expansion.
 *
 * Emits eight (faulting address, handler) pairs into the __ex_table
 * section: the four load labels 8<n>0..8<n>3 are paired with src_error
 * and the four store labels 8<n>4..8<n>7 with dst_error.  The labels
 * are looked up backwards ("b" suffix), so this macro has to be used
 * after the matching WITHEX expansion.  It switches back to .text when
 * done.
 */
#define CSUM_COPY_16_BYTES_EXCODE(n) \
.section __ex_table,"a"; \
.align 2; \
.long 8 ## n ## 0b,src_error; \
.long 8 ## n ## 1b,src_error; \
.long 8 ## n ## 2b,src_error; \
.long 8 ## n ## 3b,src_error; \
.long 8 ## n ## 4b,dst_error; \
.long 8 ## n ## 5b,dst_error; \
.long 8 ## n ## 6b,dst_error; \
.long 8 ## n ## 7b,dst_error; \
.text
.text
/*
 * Two stabs debugging entries carrying the directory and file name of
 * this source, both attached to the address of local label 0 below.
 * N_SO is not defined in this file (presumably it comes from one of the
 * included headers - verify).
 */
.stabs "arch/powerpc/lib/",N_SO,0,0,0f
.stabs "checksum_32.S",N_SO,0,0,0f
0:
/*
 * Cache-line geometry used by csum_partial_copy_generic, derived from
 * L1_CACHE_BYTES / L1_CACHE_SHIFT (not defined in this file; presumably
 * provided by <asm/cache.h>):
 *   CACHELINE_BYTES    - size of one cache line in bytes
 *   LG_CACHELINE_BYTES - shift count used to convert bytes to lines
 *   CACHELINE_MASK     - mask selecting the offset within a line
 */
CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)
/*
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 *
 * Entry:  r3 = src, r4 = dst, r5 = len, r6 = sum,
 *         r7 = src_err pointer, r8 = dst_err pointer
 * Exit:   r3 = 32-bit partial checksum
 *
 * Working registers:
 *   r4  = src - 4 (biased so that 4(r4) is the next byte to read)
 *   r6  = dst - 4 (biased the same way)
 *   r5  = bytes remaining
 *   r12 = running sum; every addition uses adde so the carry bit links
 *         one addition to the next and is folded in by addze at the end.
 *         No instruction between the initial addic and the final addze
 *         may disturb the carry bit.
 *   cr7 = "eq" when the result needs no final byte rotation
 *   12(r1) / 8(r1) = saved src_err / dst_err for the fault handlers
 *
 * The copy is done in up to four phases: single bytes until dst is word
 * aligned, words until dst is cache-line aligned, whole cache lines
 * (dcbz + unrolled 16-byte groups), then a word/halfword/byte tail.
 *
 * Odd destination handling: when dst is odd, the byte-alignment phase
 * consumes an odd number of bytes, so all following data is summed with
 * its byte lanes swapped.  That is compensated by rotating the final
 * sum by one byte.  Two details matter for correctness:
 *   - the incoming sum must be rotated by one byte as well, so that it
 *     is in the same lane order as the data it is added to;
 *   - if len is too short to reach the next cache line, the alignment
 *     phase is skipped entirely (branch to 63), the data is summed in
 *     natural order, and no rotation at all must be applied.
 * Therefore cr7 defaults to "eq" and the parity test and the rotation
 * of r12 are done only once the alignment phase is known to run.
 */
_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)		/* 16-byte frame for the two error pointers */
	stw	r7,12(r1)		/* src_err, reloaded by src_error */
	stw	r8,8(r1)		/* dst_err, reloaded by dst_error */

	addic	r12,r6,0		/* r12 = sum; adding 0 also clears carry */
	addi	r6,r4,-4		/* r6 = dst - 4 */
	neg	r0,r4
	addi	r4,r3,-4		/* r4 = src - 4 */
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	crset	4*cr7+eq		/* default: no final rotation */
	beq	58f			/* dst already cache-line aligned */

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */

	/*
	 * The alignment phase will run.  r7 = 8 if dst is odd, else 0
	 * (r6 = dst - 4 has the same parity as dst).  Rotate the initial
	 * sum by that amount and remember the parity in cr7.
	 */
	rlwinm	r7,r6,3,28,28
	rlwnm	r12,r12,r7,0,31		/* odd destination address: rotate one byte */
	cmplwi	cr7,r7,0		/* is destination address even ? */

	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0			/* r3 collects the 1..3 leading bytes */
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31		/* shift the new byte into the low end */
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5		/* len -= bytes up to the cache line */
	srwi.	r0,r0,2			/* # words up to the cache line */
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES /* r5 = bytes left after them */
	li	r11,4			/* dcbz offset: 4(r6) is the next dst byte */
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4			/* r3 = dcbt offset relative to r4 */
	cmpwi	r0,1
	li	r7,0			/* r7 = # cache lines prefetched ahead */
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

	/*
	 * First pass copies (lines - r7) cache lines while prefetching
	 * r7 lines ahead; r0 keeps the r7 lines still to do.  The second
	 * pass (entered through "bne 114b") copies those with r7 = 0.
	 */
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6			/* zero the dst line before filling it */
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0			/* any prefetched-ahead lines left ? */
	li	r3,4
	li	r7,0
	bne	114b

	/* Tail: remaining whole words, then a halfword, then a byte. */
63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8			/* a lone byte is the upper byte of a halfword */
	adde	r12,r12,r0
66:	addze	r3,r12			/* add in final carry */
	addi	r1,r1,16		/* release the frame */
	beqlr+	cr7			/* no rotation needed: done */
	rlwinm	r3,r3,8,0,31		/* odd destination address: rotate one byte */
	blr
/*
 * read fault
 *
 * Exception-table target for the loads from src in
 * csum_partial_copy_generic.  Reloads the src_err pointer that the
 * routine saved at 12(r1), releases its 16-byte stack frame and, unless
 * the pointer is NULL, stores -EFAULT through it.  r3 is not set here.
 */
src_error:
	lwz	r7,12(r1)		/* r7 = saved src_err */
	cmpwi	r7,0			/* NULL pointer: nothing to store */
	addi	r1,r1,16		/* pop the frame (does not touch cr0) */
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r7)		/* *src_err = -EFAULT */
	blr
/*
 * write fault
 *
 * Exception-table target for the stores to dst (and the dcbz) in
 * csum_partial_copy_generic.  Reloads the dst_err pointer that the
 * routine saved at 8(r1), releases its 16-byte stack frame and, unless
 * the pointer is NULL, stores -EFAULT through it.  r3 is not set here.
 */
dst_error:
	lwz	r8,8(r1)		/* r8 = saved dst_err */
	cmpwi	r8,0			/* NULL pointer: nothing to store */
	addi	r1,r1,16		/* pop the frame (does not touch cr0) */
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r8)		/* *dst_err = -EFAULT */
	blr
/*
 * Exception-table entries for the alignment phase of
 * csum_partial_copy_generic and for the dcbz in its cache-line loop.
 * Each entry is a pair: address of the instruction that may fault,
 * address to continue at.  Loads (70, 72) go to src_error; stores
 * (71, 73) and the dcbz on the destination (54) go to dst_error.
 */
.section __ex_table,"a"
.align 2
.long 70b,src_error
.long 71b,dst_error
.long 72b,src_error
.long 73b,dst_error
.long 54b,dst_error
.text
/*
 * this stuff handles faults in the cacheline loop and branches to either
 * src_error (if in read part) or dst_error (if in write part)
 *
 * The #if nesting below must mirror the nesting around the
 * CSUM_COPY_16_BYTES_WITHEX() expansions in the cacheline loop, so that
 * every 16-byte group that was emitted gets its table entries and no
 * entry refers to a label that does not exist.
 */
CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
CSUM_COPY_16_BYTES_EXCODE(2)
CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
CSUM_COPY_16_BYTES_EXCODE(4)
CSUM_COPY_16_BYTES_EXCODE(5)
CSUM_COPY_16_BYTES_EXCODE(6)
CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif
/*
 * Exception-table entries for the tail of csum_partial_copy_generic:
 * word loop (30 load / 31 store), trailing halfword (40 / 41) and
 * trailing byte (50 / 51).  Loads go to src_error, stores to dst_error.
 */
.section __ex_table,"a"
.align 2
.long 30b,src_error
.long 31b,dst_error
.long 40b,src_error
.long 41b,dst_error
.long 50b,src_error
.long 51b,dst_error