@ lbnarm.s - 32-bit bignum primitives for ARM processors with 32x32-bit multiply
@
@ This uses the standard ARM calling convention, which is that arguments
@ are passed, and results returned, in r0..r3. r0..r3, r12 (IP) and r14 (LR)
@ are volatile across the function; all others are callee-save.
@ However, note that r14 (LR) is the return address, so it would be
@ wise to save it somewhere before trashing it. Fortunately, there is
@ a neat trick possible, in that you can pop LR from the stack straight
@ into r15 (PC), effecting a return at the same time.
@
@ Also, r13 (SP) is probably best left alone, and r15 (PC) is obviously
@ reserved by hardware. Temps should use lr, then r4..r9 in order.
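@
@ For example, the entry/exit pattern used by the routines below
@ (illustrative only):
@
@	stmfd	sp!, {r4, r5, lr}	@ save the temps we use, plus LR
@	...				@ body is free to trash lr
@	ldmfd	sp!, {r4, r5, pc}	@ restore temps and return in one step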

	.text
	.align	2

@ out[0..len] = in[0..len-1] * k
@ void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
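@
@ Rough C model of what this routine computes (illustrative sketch only;
@ assumes BNWORD32 is an unsigned 32-bit type, with uint64_t used purely
@ for exposition):
@
@	uint64_t t = 0;
@	for (unsigned i = 0; i < len; i++) {
@		t += (uint64_t)in[i] * k;
@		out[i] = (BNWORD32)t;
@		t >>= 32;
@	}
@	out[len] = (BNWORD32)t;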
	.global	lbnMulN1_32
	.type	lbnMulN1_32, %function
lbnMulN1_32:
	stmfd	sp!, {r4, r5, lr}
	ldr	lr, [r1], #4		@ lr = *in++
	umull	r5, r4, lr, r3		@ (r4,r5) = lr * r3
	str	r5, [r0], #4		@ *out++ = r5
	movs	r2, r2, lsr #1		@ r2 = len/2; C = len & 1, Z = (r2 == 0)
	bcc	m32_even		@ len was even: carry is in r4
	mov	r5, r4			@ Get carry in the right register
	beq	m32_done		@ len was 1: just store the carry
m32_loop:
	@ carry is in r5
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) += lr * r3
	str	r5, [r0], #4		@ *out++ = r5
m32_even:
	@ carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	subs	r2, r2, #1
	str	r4, [r0], #4		@ *out++ = r4

	bne	m32_loop
m32_done:
	str	r5, [r0, #0]		@ store carry
	ldmfd	sp!, {r4, r5, pc}
	.size	lbnMulN1_32, .-lbnMulN1_32

@ out[0..len-1] += in[0..len-1] * k, return carry
@ BNWORD32
@ lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
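@
@ Rough C model (illustrative sketch only, same assumptions as above):
@
@	uint64_t t = 0;
@	for (unsigned i = 0; i < len; i++) {
@		t += (uint64_t)in[i] * k + out[i];
@		out[i] = (BNWORD32)t;
@		t >>= 32;
@	}
@	return (BNWORD32)t;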
	.global	lbnMulAdd1_32
	.type	lbnMulAdd1_32, %function
lbnMulAdd1_32:
	stmfd	sp!, {r4, r5, lr}

	ldr	lr, [r1], #4		@ lr = *in++
	ldr	r5, [r0, #0]		@ r5 = *out
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) += lr * r3
	str	r5, [r0], #4		@ *out++ = r5
	movs	r2, r2, lsr #1		@ r2 = len/2; C = len & 1, Z = (r2 == 0)
	bcc	ma32_even		@ len was even: carry is in r4
	beq	ma32_done		@ len was 1: return the carry
ma32_loop:
	@ carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	adds	lr, lr, r4		@ lr += product.low
	str	lr, [r0], #4		@ *out++ = lr
	adc	r4, r5, #0		@ Compute carry and move back to r4
ma32_even:
	@ another unrolled copy
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	adds	lr, lr, r4		@ lr += product.low
	adc	r4, r5, #0		@ Compute carry and move back to r4
	str	lr, [r0], #4		@ *out++ = lr
	subs	r2, r2, #1

	bne	ma32_loop
ma32_done:
	mov	r0, r4			@ return the carry
	ldmfd	sp!, {r4, r5, pc}
	.size	lbnMulAdd1_32, .-lbnMulAdd1_32

@@@ This is a bit messy... punt for now...
@ out[0..len-1] -= in[0..len-1] * k, return carry (borrow)
@ BNWORD32
@ lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
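@
@ Rough C model (illustrative sketch only, same assumptions as above; the
@ value returned is the accumulated borrow out of the top word):
@
@	uint64_t t = 0;
@	for (unsigned i = 0; i < len; i++) {
@		t += (uint64_t)in[i] * k;	/* product plus running borrow */
@		BNWORD32 lo = (BNWORD32)t;
@		t >>= 32;
@		if (out[i] < lo)		/* subtraction will underflow */
@			t++;
@		out[i] -= lo;
@	}
@	return (BNWORD32)t;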
	.global	lbnMulSub1_32
	.type	lbnMulSub1_32, %function
lbnMulSub1_32:
	stmfd	sp!, {r4, r5, lr}

	ldr	lr, [r1], #4		@ lr = *in++
	umull	r4, r5, lr, r3		@ (r5,r4) = lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r4		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r5, r5, #1		@ C clear = borrow; propagate it

	movs	r2, r2, lsr #1		@ r2 = len/2; C = len & 1, Z = (r2 == 0)
	bcc	ms32_even		@ len was even: borrow is in r5
	mov	r4, r5			@ Get borrow in the right register
	beq	ms32_done		@ len was 1: return the borrow
ms32_loop:
	@ carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r4		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r5, r5, #1		@ C clear = borrow; propagate it
ms32_even:
	@ carry is in r5
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r5		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r4, r4, #1		@ C clear = borrow; propagate it

	subs	r2, r2, #1
	bne	ms32_loop
ms32_done:
	mov	r0, r4			@ return the borrow
	ldmfd	sp!, {r4, r5, pc}

	.size	lbnMulSub1_32, .-lbnMulSub1_32

@@
@@ It's possible to eliminate the store traffic by doing the multiplies
@@ in a different order, forming all the partial products in one column
@@ at a time. But it requires a 32x32 + 64 -> 65-bit MAC. The
@@ ARM has the MAC, but no carry out.
@@
@@ The question is, is it faster to do the add directly (3 instructions),
@@ or can we compute the carry out in 1 instruction (+1 to do the add)?
@@ Well... it takes at least 1 instruction to copy the original accumulator
@@ out of the way, and 1 to do a compare, so no.
@@
@@ Now, the overall loop... this is an nxn->2n multiply. For i=0..n-1,
@@ we sum i+1 multiplies into column i (plus the carry in from the
@@ previous column). For i = n..2*n-1 we sum 2*n-1-i, plus the previous
@@ carry.
@@
@@ This "non-square" structure makes things more complicated.
@@
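@@ In C-like pseudocode, the column-at-a-time order described above would
@@ look roughly like this (illustrative sketch only; uint32_t/uint64_t stand
@@ in for the 32- and 64-bit word types, and n for len):
@@
@@	uint64_t acc = 0;	/* low 64 bits of the current column sum */
@@	uint32_t ovf = 0;	/* bits above 64: the carry out the MAC lacks */
@@	for (i = 0; i < 2*n - 1; i++) {
@@		lo = (i < n) ? 0 : i - n + 1;
@@		hi = (i < n) ? i : n - 1;
@@		for (j = lo; j <= hi; j++) {
@@			uint64_t prev = acc;
@@			acc += (uint64_t)num1[j] * num2[i-j];
@@			if (acc < prev)		/* the 65th bit */
@@				ovf++;
@@		}
@@		prod[i] = (uint32_t)acc;
@@		acc = (acc >> 32) | ((uint64_t)ovf << 32);
@@		ovf = 0;
@@	}
@@	prod[2*n - 1] = (uint32_t)acc;
@@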
@@ void
@@ lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
@@	unsigned len)
@	.global	lbnMulX_32
@	.type	lbnMulX_32, %function
@lbnMulX_32:
@	stmfd	sp!, {r4, r5, r6, r7, lr}
@
@	mov	r4, #0
@	mov	r5, #0
@	mov	r0, r4
@	ldmfd	sp!, {r4, r5, r6, r7, pc}
@	.size	lbnMulX_32, .-lbnMulX_32