174 lines
4.9 KiB
ArmAsm
174 lines
4.9 KiB
ArmAsm
@ lbnarm.s - 32-bit bignum primitives for ARM processors with 32x32-bit multiply
|
|
@
|
|
@ This uses the standard ARM calling convetion, which is that arguments
|
|
@ are passed, and results returned, in r0..r3. r0..r3, r12 (IP) and r14 (LR)
|
|
@ are volatile across the function; all others are callee-save.
|
|
@ However, note that r14 (LR) is the return address, so it would be
|
|
@ wise to save it somewhere before trashing it. Fortunately, there is
|
|
@ a neat trick possible, in that you can pop LR from the stack straight
|
|
@ into r15 (PC), effecting a return at the same time.
|
|
@
|
|
@ Also, r13 (SP) is probably best left alone, and r15 (PC) is obviously
|
|
@ reserved by hardware. Temps should use lr, then r4..r9 in order.
|
|
|
|
.text
|
|
.align 2
|
|
|
|
@ out[0..len] = in[0..len-1] * k
|
|
@ void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
|
|
.global lbnMulN1_32
|
|
.type lbnMulN1_32, %function
|
|
lbnMulN1_32:
|
|
stmfd sp!, {r4, r5, lr}
|
|
ldr lr, [r1], #4 @ lr = *in++
|
|
umull r5, r4, lr, r3 @ (r4,r5) = lr * r3
|
|
str r5, [r0], #4 @ *out++ = r5
|
|
movs r2, r2, lsr #1
|
|
bcc m32_even
|
|
mov r5, r4 @ Get carry in the right register
|
|
beq m32_done
|
|
m32_loop:
|
|
@ carry is in r5
|
|
ldr lr, [r1], #4 @ lr = *in++
|
|
mov r4, #0
|
|
umlal r5, r4, lr, r3 @ (r4,r5) += lr * r3
|
|
str r5, [r0], #4 @ *out++ = r5
|
|
m32_even:
|
|
@ carry is in r4
|
|
ldr lr, [r1], #4 @ lr = *in++
|
|
mov r5, #0
|
|
umlal r4, r5, lr, r3 @ (r5,r4) += lr * r3
|
|
subs r2, r2, #1
|
|
str r4, [r0], #4 @ *out++ = r4
|
|
|
|
bne m32_loop
|
|
m32_done:
|
|
str r5, [r0, #0] @ store carry
|
|
ldmfd sp!, {r4, r5, pc}
|
|
.size lbnMulN1_32, .-lbnMulN1_32
|
|
|
|
@ out[0..len-1] += in[0..len-1] * k, return carry
|
|
@ BNWORD32
|
|
@ lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
|
|
|
|
.global lbnMulAdd1_32
|
|
.type lbnMulAdd1_32, %function
|
|
lbnMulAdd1_32:
|
|
stmfd sp!, {r4, r5, lr}
|
|
|
|
mov r4, #0
|
|
ldr lr, [r1], #4 @ lr = *in++
|
|
ldr r5, [r0, #0] @ r5 = *out
|
|
mov r4, #0
|
|
umlal r5, r4, lr, r3 @ (r4,r5) += lr * r3
|
|
str r5, [r0], #4 @ *out++ = r5
|
|
movs r2, r2, lsr #1
|
|
bcc ma32_even
|
|
beq ma32_done
|
|
ma32_loop:
|
|
@ carry is in r4
|
|
ldr lr, [r1], #4 @ lr = *in++
|
|
mov r5, #0
|
|
umlal r4, r5, lr, r3 @ (r5,r4) += lr * r3
|
|
ldr lr, [r0, #0] @ lr = *out
|
|
adds lr, lr, r4 @ lr += product.low
|
|
str lr, [r0], #4 @ *out++ = lr
|
|
adc r4, r5, #0 @ Compute carry and move back to r4
|
|
ma32_even:
|
|
@ another unrolled copy
|
|
ldr lr, [r1], #4 @ lr = *in++
|
|
mov r5, #0
|
|
umlal r4, r5, lr, r3 @ (r5,r4) += lr * r3
|
|
ldr lr, [r0, #0] @ lr = *out
|
|
adds lr, lr, r4 @ lr += product.low
|
|
adc r4, r5, #0 @ Compute carry and move back to r4
|
|
str lr, [r0], #4 @ *out++ = lr
|
|
subs r2, r2, #1
|
|
|
|
bne ma32_loop
|
|
ma32_done:
|
|
mov r0, r4
|
|
ldmfd sp!, {r4, r5, pc}
|
|
.size lbnMulAdd1_32, .-lbnMulAdd1_32
|
|
|
|
@@@ This is a bit messy... punt for now...
|
|
@ out[0..len-1] -= in[0..len-1] * k, return carry (borrow)
|
|
@ BNWORD32
|
|
@ lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
|
|
.global lbnMulSub1_32
|
|
.type lbnMulSub1_32, %function
|
|
lbnMulSub1_32:
|
|
stmfd sp!, {r4, r5, lr}
|
|
|
|
mov r4, #0
|
|
mov r5, #0
|
|
ldr lr, [r1], #4 @ lr = *in++
|
|
umull r4, r5, lr, r3 @ (r5,r4) = lr * r3
|
|
ldr lr, [r0, #0] @ lr = *out
|
|
subs lr, lr, r4 @ lr -= product.low
|
|
str lr, [r0], #4 @ *out++ = lr
|
|
addcc r5, r5, #1 @ propagate carry
|
|
|
|
movs r2, r2, lsr #1
|
|
bcc ms32_even
|
|
mov r4, r5
|
|
beq ms32_done
|
|
ms32_loop:
|
|
@ carry is in r4
|
|
ldr lr, [r1], #4 @ lr = *in++
|
|
mov r5, #0
|
|
umlal r4, r5, lr, r3 @ (r5,r4) += lr * r3
|
|
ldr lr, [r0, #0] @ lr = *out
|
|
subs lr, lr, r4 @ lr -= product.low
|
|
str lr, [r0], #4 @ *out++ = lr
|
|
addcc r5, r5, #1 @ propagate carry
|
|
ms32_even:
|
|
@ carry is in r5
|
|
ldr lr, [r1], #4 @ lr = *in++
|
|
mov r4, #0
|
|
umlal r5, r4, lr, r3 @ (r4,r5) += lr * r3
|
|
ldr lr, [r0, #0] @ lr = *out
|
|
subs lr, lr, r5 @ lr -= product.low
|
|
str lr, [r0], #4 @ *out++ = lr
|
|
addcc r4, r4, #1 @ Propagate carry
|
|
|
|
subs r2, r2, #1
|
|
bne ms32_loop
|
|
ms32_done:
|
|
mov r0, r4
|
|
ldmfd sp!, {r4, r5, pc}
|
|
|
|
.size lbnMulSub1_32, .-lbnMulSub1_32
|
|
|
|
@@
|
|
@@ It's possible to eliminate the store traffic by doing the multiplies
|
|
@@ in a different order, forming all the partial products in one column
|
|
@@ at a time. But it requires 32x32 + 64 -> 65-bit MAC. The
|
|
@@ ARM has the MAC, but no carry out.
|
|
@@
|
|
@@ The question is, is it faster to do the add directly (3 instructions),
|
|
@@ or can we compute the carry out in 1 instruction (+1 to do the add)?
|
|
@@ Well... it takes at least 1 instruction to copy the original accumulator,
|
|
@@ out of the way, and 1 to do a compare, so no.
|
|
@@
|
|
@@ Now, the overall loop... this is an nxn->2n multiply. For i=0..n-1,
|
|
@@ we sum i+1 multiplies in each (plus the carry in from the
|
|
@@ previous one). For i = n..2*n-1 we sum 2*n-1-i, plus the previous
|
|
@@ carry.
|
|
@@
|
|
@@ This "non-square" structure makes things more complicated.
|
|
@@
|
|
@@ void
|
|
@@ lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
|
|
@@ unsigned len)
|
|
@ .global lbnMulX_32
|
|
@ .type lbnMulX_32, %function
|
|
@lbnMulX_32:
|
|
@ stmfd sp!, {r4, r5, r6, r7, lr}
|
|
@
|
|
@ mov r4, #0
|
|
@ mov r5, #0
|
|
@ mov r0, r4
|
|
@ ldmfd sp!, {r4, r5, pc}
|
|
@ .size lbnMulX_32, .-lbnMulX_32
|