@ lbnarm.s - 32-bit bignum primitives for ARM processors with 32x32-bit multiply
@
@ This uses the standard ARM calling convention: arguments are passed,
@ and results returned, in r0..r3.  r0..r3, r12 (IP) and r14 (LR) are
@ volatile across a function call; all others are callee-save.
@ However, note that r14 (LR) holds the return address, so it must be
@ saved somewhere before being trashed.  Fortunately, there is a neat
@ trick: the saved LR can be popped from the stack straight into
@ r15 (PC), effecting the return at the same time.
@
@ Also, r13 (SP) is probably best left alone, and r15 (PC) is obviously
@ reserved by hardware.  Temps should use lr, then r4..r9 in order.

	.text
	.align	2

@ out[0..len] = in[0..len-1] * k
@ void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
	.global	lbnMulN1_32
	.type	lbnMulN1_32, %function
lbnMulN1_32:
	stmfd	sp!, {r4, r5, lr}
	ldr	lr, [r1], #4		@ lr = *in++
	umull	r5, r4, lr, r3		@ (r4,r5) = lr * r3
	str	r5, [r0], #4		@ *out++ = r5
	movs	r2, r2, lsr #1		@ r2 = len/2, C = len & 1, Z = (r2 == 0)
	bcc	m32_even		@ len was even: enter loop at second half
	mov	r5, r4			@ Get carry in the right register
	beq	m32_done		@ len == 1: nothing left but the carry
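	@ The loop below is unrolled 2x.  The carry word ping-pongs between
	@ r5 and r4: the first half expects it in r5 and leaves it in r4,
	@ the second half does the reverse, so no extra moves are needed
	@ inside the loop.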
m32_loop:
	@ carry is in r5
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) += lr * r3
	str	r5, [r0], #4		@ *out++ = r5
m32_even:
	@ carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	subs	r2, r2, #1
	str	r4, [r0], #4		@ *out++ = r4

	bne	m32_loop
m32_done:
	str	r5, [r0, #0]		@ store carry
	ldmfd	sp!, {r4, r5, pc}
	.size	lbnMulN1_32, .-lbnMulN1_32
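
@ For reference, a rough C equivalent of what lbnMulN1_32 computes.
@ This is only an illustrative sketch, not part of the build; it assumes
@ <stdint.h> and that BNWORD32 is a 32-bit unsigned type:
@
@	#include <stdint.h>
@	typedef uint32_t BNWORD32;
@
@	void
@	lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
@	{
@		uint64_t p;
@		BNWORD32 carry = 0;
@		unsigned i;
@
@		for (i = 0; i < len; i++) {
@			p = (uint64_t)in[i] * k + carry;
@			out[i] = (BNWORD32)p;		/* low word */
@			carry = (BNWORD32)(p >> 32);	/* high word */
@		}
@		out[len] = carry;	/* out[] holds len+1 words */
@	}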

@ out[0..len-1] += in[0..len-1] * k, return carry
@ BNWORD32
@ lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)

	.global	lbnMulAdd1_32
	.type	lbnMulAdd1_32, %function
lbnMulAdd1_32:
	stmfd	sp!, {r4, r5, lr}

	ldr	lr, [r1], #4		@ lr = *in++
	ldr	r5, [r0, #0]		@ r5 = *out
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) += lr * r3
	str	r5, [r0], #4		@ *out++ = r5
	movs	r2, r2, lsr #1		@ r2 = len/2, C = len & 1, Z = (r2 == 0)
	bcc	ma32_even		@ len was even: enter loop at second half
	beq	ma32_done		@ len == 1: carry is already in r4
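	@ The loop below is unrolled 2x.  Unlike lbnMulN1_32, the carry is
	@ moved back into r4 at the end of each half (by the adc), so both
	@ halves expect it in the same register.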
ma32_loop:
	@ carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	adds	lr, lr, r4		@ lr += product.low
	str	lr, [r0], #4		@ *out++ = lr
	adc	r4, r5, #0		@ Compute carry and move back to r4
ma32_even:
	@ another unrolled copy
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	adds	lr, lr, r4		@ lr += product.low
	adc	r4, r5, #0		@ Compute carry and move back to r4
	str	lr, [r0], #4		@ *out++ = lr
	subs	r2, r2, #1

	bne	ma32_loop
ma32_done:
	mov	r0, r4
	ldmfd	sp!, {r4, r5, pc}
	.size	lbnMulAdd1_32, .-lbnMulAdd1_32
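
@ For reference, a rough C equivalent of lbnMulAdd1_32 (an illustrative
@ sketch only, with the same BNWORD32/uint64_t assumptions as above):
@
@	BNWORD32
@	lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
@	{
@		uint64_t p, s;
@		BNWORD32 carry = 0;
@		unsigned i;
@
@		for (i = 0; i < len; i++) {
@			p = (uint64_t)in[i] * k + carry;	/* product + carry in */
@			s = (uint64_t)out[i] + (BNWORD32)p;	/* add low word to *out */
@			out[i] = (BNWORD32)s;
@			carry = (BNWORD32)(p >> 32) + (BNWORD32)(s >> 32);
@		}
@		return carry;
@	}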

@@@ This is a bit messy... punt for now...
@ out[0..len-1] -= in[0..len-1] * k, return carry (borrow)
@ BNWORD32
@ lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
	.global	lbnMulSub1_32
	.type	lbnMulSub1_32, %function
lbnMulSub1_32:
	stmfd	sp!, {r4, r5, lr}

	ldr	lr, [r1], #4		@ lr = *in++
	umull	r4, r5, lr, r3		@ (r5,r4) = lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r4		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r5, r5, #1		@ propagate carry

	movs	r2, r2, lsr #1		@ r2 = len/2, C = len & 1, Z = (r2 == 0)
	bcc	ms32_even		@ len was even: enter loop at second half
	mov	r4, r5			@ Get borrow in the right register
	beq	ms32_done		@ len == 1: borrow is the return value
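	@ The loop below is unrolled 2x; the borrow ping-pongs between r4
	@ and r5.  Note that an ARM subtract clears C on borrow, hence the
	@ addcc to fold the borrow into the next word's subtraction.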
ms32_loop:
	@ carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r4		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r5, r5, #1		@ propagate carry
ms32_even:
	@ carry is in r5
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r5		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r4, r4, #1		@ Propagate carry

	subs	r2, r2, #1
	bne	ms32_loop
ms32_done:
	mov	r0, r4
	ldmfd	sp!, {r4, r5, pc}

	.size	lbnMulSub1_32, .-lbnMulSub1_32
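
@ For reference, a rough C equivalent of lbnMulSub1_32 (an illustrative
@ sketch only, same assumptions as above; the return value is the final
@ borrow word):
@
@	BNWORD32
@	lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
@	{
@		uint64_t p;
@		BNWORD32 borrow = 0, lo, t;
@		unsigned i;
@
@		for (i = 0; i < len; i++) {
@			p = (uint64_t)in[i] * k + borrow;	/* product + borrow in */
@			lo = (BNWORD32)p;
@			t = out[i];
@			out[i] = t - lo;
@			borrow = (BNWORD32)(p >> 32) + (t < lo);	/* borrow out */
@		}
@		return borrow;
@	}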

@@
@@ It's possible to eliminate the store traffic by doing the multiplies
@@ in a different order, forming all the partial products in one column
@@ at a time.  But that requires a 32x32 + 64 -> 65-bit MAC.  The ARM
@@ has the 64-bit MAC (umlal), but no carry out of it.
@@
@@ The question is whether it is faster to do the add directly (3
@@ instructions), or whether the carry out can be computed in 1 instruction
@@ (+1 to do the add).  Well... it takes at least 1 instruction to copy the
@@ original accumulator out of the way, and 1 to do a compare, so no.
@@
@@ Now, the overall loop... this is an nxn->2n multiply.  For i = 0..n-1,
@@ column i sums i+1 products (plus the carry in from the previous
@@ column).  For i = n..2*n-1, column i sums 2*n-1-i products, plus the
@@ previous carry.
@@
@@ This "non-square" structure makes things more complicated.
@@
@@ void
@@ lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
@@ 	unsigned len)
@	.global	lbnMulX_32
@	.type	lbnMulX_32, %function
@lbnMulX_32:
@	stmfd	sp!, {r4, r5, r6, r7, lr}
@
@	mov	r4, #0
@	mov	r5, #0
@	mov	r0, r4
@	ldmfd	sp!, {r4, r5, pc}
@	.size	lbnMulX_32, .-lbnMulX_32
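
@@ For what it's worth, here is a plain C sketch of the column-at-a-time
@@ (product-scanning) structure described above, keeping the "65th bit"
@@ in a separate counter.  Illustrative only (same BNWORD32/uint64_t
@@ assumptions as the sketches above, and len >= 1):
@@
@@	void
@@	lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
@@		unsigned len)
@@	{
@@		uint64_t acc = 0;	/* low 64 bits of the column sum */
@@		BNWORD32 over = 0;	/* carries out of the 64-bit acc */
@@		uint64_t p;
@@		unsigned i, j, lo, hi;
@@
@@		for (i = 0; i < 2*len - 1; i++) {
@@			lo = (i < len) ? 0 : i - len + 1;
@@			hi = (i < len) ? i : len - 1;
@@			for (j = lo; j <= hi; j++) {
@@				p = (uint64_t)num1[j] * num2[i-j];
@@				acc += p;
@@				over += (acc < p);	/* the 65th bit */
@@			}
@@			prod[i] = (BNWORD32)acc;	/* low word of the column */
@@			acc = (acc >> 32) | ((uint64_t)over << 32);
@@			over = 0;
@@		}
@@		prod[2*len - 1] = (BNWORD32)acc;	/* final carry word */
@@	}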