319 lines
11 KiB
C
319 lines
11 KiB
C
#include "lbnppc.h"
|
|
|
|
/*
|
|
* lbnppc.c - Assembly primitives for the bignum library, PowerPC version.
|
|
*
|
|
* Copyright (c) 1995 Colin Plumb. All rights reserved.
|
|
* For licensing and other legal details, see the file legal.c.
|
|
*
|
|
* Register usage during function calls is:
|
|
* r0 - volatile
|
|
* r1 - stack pointer, preserved
|
|
* r2 - TOC pointer, preserved
|
|
* r3 - First argument and return value register
|
|
* r4-r10 - More argument registers, volatile
|
|
* r11-r12 - Volatile
|
|
* r13-r31 - Preserved
|
|
* LR, CTR, XER and MQ are all volatile.
|
|
* LR holds return address on entry.
|
|
*
|
|
* On the PPC 601, unrolling the loops more doesn't seem to speed things
|
|
* up at all. I'd be curious if other chips differed.
|
|
*/
|
|
#if __MWERKS__ < 0x800
|
|
|
|
#include "ppcasm.h" /* PowerPC assembler */
|
|
|
|
/*
|
|
* MulN1 expects (*out, *in, len, k), count >= 1
|
|
* r3 r4 r5 r6
|
|
*/
|
|
static const unsigned mulN1[] = {
|
|
PPC_LWZ(7,4,0), /* Load first word of in in r7 */
|
|
PPC_MULLW(8,7,6), /* Low half of multiply in r8 */
|
|
PPC_MTCTR(5), /* Move len into CTR */
|
|
PPC_ADDIC(0,0,0), /* Clear carry bit for loop */
|
|
PPC_MULHWU(5,7,6), /* High half of multiply in r5 */
|
|
PPC_STW(8,3,0),
|
|
PPC_BC(18,31,7), /* Branch to Label if --ctr == 0 */
|
|
/* Loop: */
|
|
PPC_LWZU(7,4,4), /* r7 = *++in */
|
|
PPC_MULLW(8,7,6), /* r8 = low word of product */
|
|
PPC_ADDE(8,8,5), /* Add carry word r5 and bit CF to r8 */
|
|
PPC_STWU(8,3,4), /* *++out = r8 */
|
|
PPC_MULHWU(5,7,6), /* r5 is high word of product, for carry word */
|
|
PPC_BC(16,31,-5), /* Branch to Loop if --ctr != 0 */
|
|
/* Label: */
|
|
PPC_ADDZE(5,5), /* Add carry flag to r5 */
|
|
PPC_STW(5,3,4), /* out[1] = r5 */
|
|
PPC_BLR()
|
|
};
|
|
|
|
/*
|
|
* MulAdd1 expects (*out, *in, len, k), count >= 1
|
|
* r3 r4 r5 r6
|
|
*/
|
|
static unsigned const mulAdd1[] = {
|
|
PPC_LWZ(7,4,0), /* Load first word of in in r7 */
|
|
PPC_LWZ(0,3,0), /* Load first word of out into r0 */
|
|
PPC_MULLW(8,7,6), /* Low half of multiply in r8 */
|
|
PPC_MTCTR(5), /* Move len into CTR */
|
|
PPC_MULHWU(5,7,6), /* High half of multiply in r5 */
|
|
PPC_ADDC(8,8,0), /* r8 = r8 + r0 */
|
|
PPC_STW(8,3,0), /* Store result to memory */
|
|
PPC_BC(18,31,10), /* Branch to Label if --ctr == 0 */
|
|
/* Loop: */
|
|
PPC_LWZU(7,4,4), /* r7 = *++in */
|
|
PPC_LWZU(0,3,4), /* r0 = *++out */
|
|
PPC_MULLW(8,7,6), /* r8 = low word of product */
|
|
PPC_ADDE(8,8,5), /* Add carry word r5 and carry bit CF to r8 */
|
|
PPC_MULHWU(5,7,6), /* r5 is high word of product, for carry word */
|
|
PPC_ADDZE(5,5), /* Add carry bit from low add to r5 */
|
|
PPC_ADDC(8,8,0), /* r8 = r8 + r0 */
|
|
PPC_STW(8,3,0), /* *out = r8 */
|
|
PPC_BC(16,31,-8), /* Branch to Loop if --ctr != 0 */
|
|
/* Label: */
|
|
PPC_ADDZE(3,5), /* Add carry flag to r5 and move to r3 */
|
|
PPC_BLR()
|
|
};
|
|
|
|
/*
|
|
* MulSub1 expects (*out, *in, len, k), count >= 1
|
|
* r3 r4 r5 r6
|
|
*
|
|
* Multiply and subtract is rather a pain. If the subtract of the
|
|
* low word of the product from out[i] generates a borrow, we want to
|
|
* increment the carry word (initially in the range 0..0xfffffffe).
|
|
* However, the PPC's carry bit CF is *clear* after a subtract, so
|
|
* we want to add (1-CF) to the carry word. This is done using two
|
|
* instructions:
|
|
*
|
|
* SUBFME, subtract from minus one extended. This computes
|
|
* rD = ~rS + 0xffffffff + CF. Since rS is from 0 to 0xfffffffe,
|
|
* ~rS is from 1 through 0xffffffff, and the sum with 0xffffffff+CF is
|
|
* from 0 through 0xfffffffff, setting the carry flag unconditionally, and
|
|
* NOR, which is used as a bitwise invert NOT instruction.
|
|
*
|
|
* The SUBFME performs the computation rD = ~rS + 0xffffffff + CF,
|
|
* = (-rS - 1) + (CF - 1) = -(rS - CF + 1) - 1 = ~(rS + 1-CF),
|
|
* which is the bitwise complement of the value we want.
|
|
* We want to add the complement of that result to the low word of the
|
|
* product, which is just what a subtract would do, if only we could get
|
|
* the carry flag clear. But it's always set, except for SUBFE, and the
|
|
* operation we just performed unconditionally *sets* the carry flag. Ugh.
|
|
* So find the complement in a separate instruction.
|
|
*/
|
|
static unsigned const mulSub1[] = {
|
|
PPC_LWZ(7,4,0), /* Load first word of in in r7 */
|
|
PPC_LWZ(0,3,0), /* Load first word of out into r0 */
|
|
PPC_MTCTR(5), /* Move len into CTR */
|
|
PPC_MULLW(8,7,6), /* Low half of multiply in r8 */
|
|
PPC_MULHWU(5,7,6), /* High half of multiply in r5 */
|
|
PPC_SUBFC(8,8,0), /* r8 = r0 - r8, setting CF */
|
|
PPC_STW(8,3,0), /* Store result to memory */
|
|
PPC_SUBFME(5,5), /* First of two insns to add (1-CF) to r5 */
|
|
PPC_BC(18,31,12), /* Branch to Label if --ctr == 0 */
|
|
/* Loop: */
|
|
PPC_LWZU(7,4,4), /* r7 = *++in */
|
|
PPC_LWZU(0,3,4), /* r0 = *++out */
|
|
PPC_NOR(5,5,5), /* Second of two insns to add (1-CF) to r5 */
|
|
PPC_MULLW(8,7,6), /* r8 = low word of product */
|
|
PPC_ADDC(8,8,5), /* Add carry word r5 to r8 */
|
|
PPC_MULHWU(5,7,6), /* r5 is high word of product, for carry word */
|
|
PPC_ADDZE(5,5), /* Add carry bit from low add to r5 */
|
|
PPC_SUBFC(8,8,0), /* r8 = r0 - r8, setting CF */
|
|
PPC_STW(8,3,0), /* *out = r8 */
|
|
PPC_SUBFME(5,5), /* First of two insns to add (1-CF) to r5 */
|
|
PPC_BC(16,31,-10), /* Branch to Loop if --ctr != 0 */
|
|
/* Label: */
|
|
PPC_NOR(3,5,5), /* Finish adding (1-CF) to r5, store in r3 */
|
|
PPC_BLR()
|
|
};
|
|
|
|
#if 0
|
|
/*
|
|
* Args: BNWORD32 *n, BNWORD32 const *mod, unsigned mlen, BNWORD32 inv)
|
|
* r3 r4 r5 r6
|
|
* r7, r8 and r9 are the triple-width accumulator.
|
|
* r0 and r10 are temporary registers.
|
|
* r11 and r12 are temporary pointers into n and mod, respectively.
|
|
* r2 (!) is another temporary register.
|
|
*/
|
|
static unsigned const montReduce[] = {
|
|
PPC_MTCTR(5), /* ??? */
|
|
PPC_LWZ(7,3,0), /* Load low word of n into r7 */
|
|
PPC_LWZ(10,4,0), /* Fetch low word of mod */
|
|
PPC_MULLW(0,7,6), /* Invert r7 into r0 */
|
|
PPC_STW(0,3,0), /* Store back for future use */
|
|
PPC_MULHWU(8,10,7), /* Get high word of whatnot */
|
|
PPC_MULLW(10,10,7), /* Get low word of it */
|
|
PPC_ADDC(7,7,10), /* Add low word of product to r7 */
|
|
PPC_ADDZE(8,8), /* Add carry to high word */
|
|
PPC_
|
|
|
|
|
|
PPC_MULHW(8,7,6),
|
|
PPC_ADDC(7,7,0), /* Add inverse back to r7 */
|
|
PPC_ADDZE(8,8),
|
|
PPC_
|
|
|
|
PPC_LWZU(
|
|
/* Loop: */
|
|
PPC_LWZU(0,11,4),
|
|
PPC_LWZU(10,23,-4),
|
|
PPC_MULLW(2,0,10),
|
|
PPC_ADDC(7,7,2),
|
|
PPC_MULHWU(0,0,10),
|
|
PPC_ADDE(8,8,0),
|
|
PPC_ADDZE(9,9),
|
|
PPC_BC(16,31,-7), /* Branch to Loop if --ctr != 0 */
|
|
|
|
PPC_ADDIC_(count,-1),
|
|
PPC_LWZU(0,x,4),
|
|
PPC_ADDC(0,7,0),
|
|
PPC_STW(0,x,0),
|
|
PPC_ADDZE(7,8),
|
|
PPC_ADDZE(8,9),
|
|
PPC_LI(9,0),
|
|
PPC_BC(xx,2,yy),
|
|
|
|
};
|
|
#endif
|
|
|
|
/*
|
|
* Three overlapped transition vectors for three functions.
|
|
* A PowerPC transition vector for a (potentially) inter-module
|
|
* jump or call consists of two words, an instruction address
|
|
* and a Table Of Contents (TOC) pointer, which is loaded into
|
|
* r1. Since none of the routines here have global variables,
|
|
* they don't need a TOC pointer, so the value is unimportant.
|
|
* This array places an unintersting 32-bit value after each address.
|
|
*/
|
|
unsigned const * const lbnPPC_tv[] = {
|
|
mulN1,
|
|
mulAdd1,
|
|
mulSub1,
|
|
0
|
|
};
|
|
|
|
#else /* __MWERKS >= 0x800 */
|
|
|
|
/*
|
|
* MulN1 expects (*out, *in, len, k), count >= 1
|
|
* r3 r4 r5 r6
|
|
*/
|
|
asm void
|
|
lbnMulN1_32(register unsigned *out, register unsigned const *in,
|
|
register unsigned len, register unsigned k)
|
|
{
|
|
lwz r7,0(in) /* Load first word of in in r7 */
|
|
mtctr len /* Move len into CTR */
|
|
mullw r8,r7,k /* Low half of multiply in r8 */
|
|
addic r0,r0,0 /* Clear carry bit for loop */
|
|
mulhwu len,r7,k /* High half of multiply in len */
|
|
stw r8,0(out) /* *out = r8 */
|
|
mulhwu len,r7,k /* len is high word of product, for carry */
|
|
bdz- label /* Branch to Label if --ctr == 0 */
|
|
loop:
|
|
lwzu r7,4(in) /* r7 = *++in */
|
|
mullw r8,r7,k /* Low half of multiply in r8 */
|
|
adde r8,r8,len /* Add carry word len and bit CF to r8 */
|
|
stwu r8,4(out) /* *++out = r8 */
|
|
mulhwu len,r7,k /* len is high word of product, for carry */
|
|
bdnz+ loop /* Branch to Loop if --ctr != 0 */
|
|
label:
|
|
addze len,len /* Add carry flag to carry word */
|
|
stw len,4(out)
|
|
blr
|
|
}
|
|
|
|
/*
|
|
* MulAdd1 expects (*out, *in, len, k), count >= 1
|
|
* r3 r4 r5 r6
|
|
*/
|
|
asm unsigned
|
|
lbnMulAdd1_32(register unsigned *out, register unsigned const *in,
|
|
register unsigned len, register unsigned k)
|
|
{
|
|
lwz r7,0(in) /* Load first word of in in r7 */
|
|
lwz r0,0(out) /* Load first word of out into r0 */
|
|
mullw r8,r7,k /* Low half of multiply in r8 */
|
|
mtctr len /* Move len into CTR */
|
|
mulhwu len,r7,k /* High half of multiply in len */
|
|
addc r8,r8,r0 /* r8 = r8 + r0 */
|
|
stw r8,0(out) /* Store result to memory */
|
|
bdz- label /* Branch to Label if --ctr == 0 */
|
|
loop:
|
|
lwzu r7,4(in) /* r7 = *++in */
|
|
lwzu r0,4(out) /* r0 = *++out */
|
|
mullw r8,r7,k /* r8 = low word of product */
|
|
adde r8,r8,len /* Add carry word len and carry bit CF to r8 */
|
|
mulhwu len,r7,k /* len is high word of product, for carry */
|
|
addze len,len /* Add carry bit from low add to r5 */
|
|
addc r8,r8,r0 /* r8 = r8 + r0 */
|
|
stw r8,0(out) /* *out = r8 */
|
|
bdnz+ loop /* Branch to Loop if --ctr != 0 */
|
|
label:
|
|
addze r3,r5 /* Add carry flag to r5 and move to r3 */
|
|
blr
|
|
}
|
|
|
|
/*
|
|
* MulSub1 expects (*out, *in, len, k), count >= 1
|
|
* r3 r4 r5 r6
|
|
*
|
|
* Multiply and subtract is rather a pain. If the subtract of the
|
|
* low word of the product from out[i] generates a borrow, we want to
|
|
* increment the carry word (initially in the range 0..0xfffffffe).
|
|
* However, the PPC's carry bit CF is *clear* after a subtract, so
|
|
* we want to add (1-CF) to the carry word. This is done using two
|
|
* instructions:
|
|
*
|
|
* SUBFME, subtract from minus one extended. This computes
|
|
* rD = ~rS + 0xffffffff + CF. Since rS is from 0 to 0xfffffffe,
|
|
* ~rS is from 1 through 0xffffffff, and the sum with 0xffffffff+CF is
|
|
* from 0 through 0xfffffffff, setting the carry flag unconditionally, and
|
|
* NOR, which is used as a bitwise invert NOT instruction.
|
|
*
|
|
* The SUBFME performs the computation rD = ~rS + 0xffffffff + CF,
|
|
* = (-rS - 1) + (CF - 1) = -(rS - CF + 1) - 1 = ~(rS + 1-CF),
|
|
* which is the bitwise complement of the value we want.
|
|
* We want to add the complement of that result to the low word of the
|
|
* product, which is just what a subtract would do, if only we could get
|
|
* the carry flag clear. But it's always set, except for SUBFE, and the
|
|
* operation we just performed unconditionally *sets* the carry flag. Ugh.
|
|
* So find the complement in a separate instruction.
|
|
*/
|
|
asm unsigned
|
|
lbnMulSub1_32(register unsigned *out, register unsigned const *in,
|
|
register unsigned len, register unsigned k)
|
|
{
|
|
lwz r7,0(in) /* Load first word of in in r7 */
|
|
lwz r0,0(out) /* Load first word of out into r0 */
|
|
mtctr len /* Move len into CTR */
|
|
mullw r8,r7,k /* Low half of multiply in r8 */
|
|
mulhwu len,r7,k /* High half of multiply in len */
|
|
subfc r8,r8,r0 /* r8 = r0 - r8, setting CF */
|
|
stw r8,0(out) /* Store result to memory */
|
|
subfme len,len /* First of two insns to add (1-CF) to len */
|
|
bdz- label /* Branch to Label if --ctr == 0 */
|
|
loop:
|
|
lwzu r7,4(in) /* r7 = *++in */
|
|
lwzu r0,4(out) /* r0 = *++out */
|
|
nor len,len,len /* Second of two insns to add (1-CF) to len */
|
|
mullw r8,r7,k /* r8 = low word of product */
|
|
addc r8,r8,len /* Add carry word len to r8 */
|
|
mulhwu len,r7,k /* len is high word of product, for carry */
|
|
addze len,len /* Add carry bit from low add to len */
|
|
subfc r8,r8,r0 /* r8 = r0 - r8 */
|
|
stw r8,0(out) /* *out = r8 */
|
|
subfme len,len /* First of two insns to add (1-CF) to len */
|
|
bdnz+ loop /* Branch to Loop if --ctr != 0 */
|
|
label:
|
|
nor r3,r5,r5 /* Finish adding (1-CF) to len, store in r3 */
|
|
blr
|
|
}
|
|
|
|
#endif /* __MWERKS >= 0x800 */
|
|
/* 45678901234567890123456789012345678901234567890123456789012345678901234567 */
|