### Copyright (c) 1995, Colin Plumb.
### For licensing and other legal details, see the file legal.c.
###
### Assembly primitives for bignum library, 80386 family, 32-bit code.
###
### Several primitives are included here. Only lbnMulAdd1 is *really*
### critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
### easy to write, so they are included here as well.
### lbnDiv21 and lbnModQ are so easy to write that they're included, too.
###
### All functions here are for 32-bit flat mode, i.e. near code and
### near data, although the near offsets are 32 bits.
### Preserved registers are esp, ebp, esi, edi and ebx. The last of
### these is needed by ELF for PIC, and differs from the IBM PC calling
### convention.

# Different assemblers have different conventions here
align4=4	# could be 2 or 4
align8=8	# could be 3 or 8
align16=16	# could be 4 or 16
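# (With a.out-style assemblers, .align takes a power-of-two exponent,
# so 16-byte alignment is ".align 4"; ELF assemblers take a byte
# count, ".align 16". The settings above assume the byte-count
# convention; use the exponent values for the other.)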

	.text

# We declare each symbol with two names, to deal with ELF/a.out
# naming differences.
	.globl lbnMulN1_32
	.globl _lbnMulN1_32
	.globl lbnMulAdd1_32
	.globl _lbnMulAdd1_32
	.globl lbnMulSub1_32
	.globl _lbnMulSub1_32
	.globl lbnDiv21_32
	.globl _lbnDiv21_32
	.globl lbnModQ_32
	.globl _lbnModQ_32

## Register usage:
## %eax - low half of product
## %ebx - carry to next iteration
## %ecx - multiplier (k)
## %edx - high half of product
## %esi - source pointer
## %edi - dest pointer
## %ebp - loop counter
##
## Stack frame:
## +--------+ %esp+20  %esp+24  %esp+28  %esp+32  %esp+36
## |    k   |
## +--------+ %esp+16  %esp+20  %esp+24  %esp+28  %esp+32
## |   len  |
## +--------+ %esp+12  %esp+16  %esp+20  %esp+24  %esp+28
## |   in   |
## +--------+ %esp+8   %esp+12  %esp+16  %esp+20  %esp+24
## |   out  |
## +--------+ %esp+4   %esp+8   %esp+12  %esp+16  %esp+20
## | return |
## +--------+ %esp     %esp+4   %esp+8   %esp+12  %esp+16
## |  %esi  |
## +--------+          %esp     %esp+4   %esp+8   %esp+12
## |  %ebp  |
## +--------+                   %esp     %esp+4   %esp+8
## |  %ebx  |
## +--------+                            %esp     %esp+4
## |  %edi  |
## +--------+                                     %esp
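##
## The saved-register rows above show lbnMulN1_32's push order
## (%esi, %ebp, %ebx, %edi); lbnMulAdd1_32 and lbnMulSub1_32 push in
## a different order but take the same four arguments. The C
## prototypes are not given in this file; judging by the frame layout
## they are presumably:
##
##   void     lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in,
##                        unsigned len, BNWORD32 k)
##   BNWORD32 lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in,
##                          unsigned len, BNWORD32 k)
##   BNWORD32 lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in,
##                          unsigned len, BNWORD32 k)
##
## lbnMulN1 writes len+1 words of product (out[len] gets the final
## carry); lbnMulAdd1 and lbnMulSub1 update out[] in place and return
## the final carry (resp. borrow) word in %eax.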

	.align	align16
lbnMulN1_32:
_lbnMulN1_32:
	pushl %esi		# U
	movl 12(%esp),%esi	# V	load in
	pushl %ebp		# U
	movl 20(%esp),%ebp	# V	load len
	pushl %ebx		# U
	movl 28(%esp),%ecx	# V	load k
	pushl %edi		# U
	movl 20(%esp),%edi	# V	load out

## First multiply step has no carry in.
	movl (%esi),%eax	# V
	leal -4(,%ebp,4),%ebx	# U	loop unrolling
	mull %ecx		# NP	first multiply
	movl %eax,(%edi)	# U
	andl $12,%ebx		# V	loop unrolling

	addl %ebx,%esi		# U	loop unrolling
	addl %ebx,%edi		# V	loop unrolling

	jmp *m32_jumptable(%ebx)	# NP	loop unrolling
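# The leal/andl above computed %ebx = 4*((len-1) mod 4); it biases
# %esi/%edi and indexes the jump table so that the len-1 remaining
# words are handled as (len-1) mod 4 odd words plus 4-word loop
# iterations. E.g. len=6: %ebx = (4*6-4) & 12 = 4, entering at
# m32_case1.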

	.align	align4
m32_jumptable:
	.long	m32_case0
	.long	m32_case1
	.long	m32_case2
	.long	m32_case3

	nop
	.align	align8
	nop
	nop
	nop			# Get loop nicely aligned

m32_case0:
	subl $4,%ebp		# U
	jbe m32_done		# V

m32_loop:
	movl 4(%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	addl $16,%esi		# U
	addl $16,%edi		# V
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	adcl $0,%edx		# U
	movl %eax,-12(%edi)	# V
m32_case3:
	movl -8(%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	adcl $0,%edx		# U
	movl %eax,-8(%edi)	# V
m32_case2:
	movl -4(%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	adcl $0,%edx		# U
	movl %eax,-4(%edi)	# V
m32_case1:
	movl (%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	adcl $0,%edx		# U
	movl %eax,(%edi)	# V

	subl $4,%ebp		# U
	ja m32_loop		# V

m32_done:
	movl %edx,4(%edi)	# U
	popl %edi		# V
	popl %ebx		# U
	popl %ebp		# V
	popl %esi		# U
	ret			# NP
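
## For reference, a minimal C sketch of what lbnMulAdd1_32 computes,
## assuming a 64-bit BNWORD64 type (an illustration inferred from the
## code below, not part of the original file):
##
##   BNWORD32 lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in,
##                          unsigned len, BNWORD32 k)
##   {
##           BNWORD64 carry = 0;
##           unsigned i;
##
##           for (i = 0; i < len; i++) {
##                   carry += (BNWORD64)in[i] * k + out[i];
##                   out[i] = (BNWORD32)carry;
##                   carry >>= 32;
##           }
##           return (BNWORD32)carry;  /* final carry word */
##   }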

	.align	align16
lbnMulAdd1_32:
_lbnMulAdd1_32:
	pushl %esi		# U
	movl 12(%esp),%esi	# V	load in
	pushl %edi		# U
	movl 12(%esp),%edi	# V	load out
	pushl %ebp		# U
	movl 24(%esp),%ebp	# V	load len
	pushl %ebx		# U
	movl 32(%esp),%ecx	# V	load k

## First multiply step has no carry in.
	movl (%esi),%eax	# V
	movl (%edi),%ebx	# U
	mull %ecx		# NP	first multiply
	addl %eax,%ebx		# U
	leal -4(,%ebp,4),%eax	# V	loop unrolling
	adcl $0,%edx		# U
	andl $12,%eax		# V	loop unrolling
	movl %ebx,(%edi)	# U

	addl %eax,%esi		# V	loop unrolling
	addl %eax,%edi		# U	loop unrolling

	jmp *ma32_jumptable(%eax)	# NP	loop unrolling
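# Same (len-1) mod 4 entry dispatch as in lbnMulN1_32 above.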

	.align	align4
ma32_jumptable:
	.long	ma32_case0
	.long	ma32_case1
	.long	ma32_case2
	.long	ma32_case3

	.align	align8
	nop
	nop
	nop			# To align loop properly

ma32_case0:
	subl $4,%ebp		# U
	jbe ma32_done		# V

ma32_loop:
	movl 4(%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	addl $16,%esi		# U
	addl $16,%edi		# V
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	movl -12(%edi),%ebx	# V
	adcl $0,%edx		# U
	addl %eax,%ebx		# V
	adcl $0,%edx		# U
	movl %ebx,-12(%edi)	# V
ma32_case3:
	movl -8(%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	movl -8(%edi),%ebx	# V
	adcl $0,%edx		# U
	addl %eax,%ebx		# V
	adcl $0,%edx		# U
	movl %ebx,-8(%edi)	# V
ma32_case2:
	movl -4(%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	movl -4(%edi),%ebx	# V
	adcl $0,%edx		# U
	addl %eax,%ebx		# V
	adcl $0,%edx		# U
	movl %ebx,-4(%edi)	# V
ma32_case1:
	movl (%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	movl (%edi),%ebx	# V
	adcl $0,%edx		# U
	addl %eax,%ebx		# V
	adcl $0,%edx		# U
	movl %ebx,(%edi)	# V

	subl $4,%ebp		# U
	ja ma32_loop		# V

ma32_done:
	popl %ebx		# U
	popl %ebp		# V
	movl %edx,%eax		# U
	popl %edi		# V
	popl %esi		# U
	ret			# NP
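
## lbnMulSub1_32 is the same except that the product is subtracted
## and the final borrow word is returned. A minimal C sketch (again
## inferred from the code below, assuming a 64-bit BNWORD64 type):
##
##   BNWORD32 lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in,
##                          unsigned len, BNWORD32 k)
##   {
##           BNWORD64 borrow = 0;
##           BNWORD32 t;
##           unsigned i;
##
##           for (i = 0; i < len; i++) {
##                   borrow += (BNWORD64)in[i] * k;
##                   t = out[i] - (BNWORD32)borrow;
##                   borrow >>= 32;
##                   if (t > out[i])  /* low-word subtract borrowed */
##                           borrow++;
##                   out[i] = t;
##           }
##           return (BNWORD32)borrow;
##   }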

	.align	align16
lbnMulSub1_32:
_lbnMulSub1_32:
	pushl %esi		# U
	movl 12(%esp),%esi	# V	load in
	pushl %edi		# U
	movl 12(%esp),%edi	# V	load out
	pushl %ebp		# U
	movl 24(%esp),%ebp	# V	load len
	pushl %ebx		# U
	movl 32(%esp),%ecx	# V	load k

## First multiply step has no carry in.
	movl (%esi),%eax	# V
	movl (%edi),%ebx	# U
	mull %ecx		# NP	first multiply
	subl %eax,%ebx		# U
	leal -4(,%ebp,4),%eax	# V	loop unrolling
	adcl $0,%edx		# U
	andl $12,%eax		# V	loop unrolling
	movl %ebx,(%edi)	# U

	addl %eax,%esi		# V	loop unrolling
	addl %eax,%edi		# U	loop unrolling

	jmp *ms32_jumptable(%eax)	# NP	loop unrolling
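# Entry dispatch as above: enter at word (len-1) mod 4.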

	.align	align4
ms32_jumptable:
	.long	ms32_case0
	.long	ms32_case1
	.long	ms32_case2
	.long	ms32_case3

	.align	align8
	nop
	nop
	nop

ms32_case0:
	subl $4,%ebp		# U
	jbe ms32_done		# V

ms32_loop:
	movl 4(%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	addl $16,%esi		# U
	addl $16,%edi		# V
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	movl -12(%edi),%ebx	# V
	adcl $0,%edx		# U
	subl %eax,%ebx		# V
	adcl $0,%edx		# U
	movl %ebx,-12(%edi)	# V
ms32_case3:
	movl -8(%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	movl -8(%edi),%ebx	# V
	adcl $0,%edx		# U
	subl %eax,%ebx		# V
	adcl $0,%edx		# U
	movl %ebx,-8(%edi)	# V
ms32_case2:
	movl -4(%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	movl -4(%edi),%ebx	# V
	adcl $0,%edx		# U
	subl %eax,%ebx		# V
	adcl $0,%edx		# U
	movl %ebx,-4(%edi)	# V
ms32_case1:
	movl (%esi),%eax	# U
	movl %edx,%ebx		# V	Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U	Add carry in from previous word
	movl (%edi),%ebx	# V
	adcl $0,%edx		# U
	subl %eax,%ebx		# V
	adcl $0,%edx		# U
	movl %ebx,(%edi)	# V

	subl $4,%ebp		# U
	ja ms32_loop		# V

ms32_done:
	popl %ebx		# U
	popl %ebp		# V
	movl %edx,%eax		# U
	popl %edi		# V
	popl %esi		# U
	ret			# NP

## Two-word by one-word divide. Stores quotient, returns remainder.
## BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
##                                4           8            12           16
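##
## This is the bare divl instruction, so callers presumably guarantee
## nh < d; otherwise the quotient would overflow 32 bits and the CPU
## would raise a divide fault. E.g. nh=1, nl=0, d=3 divides
## 0x100000000 by 3: 0x55555555 is stored in *q and 1 is returned.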

	.align	align16
lbnDiv21_32:
_lbnDiv21_32:
	movl 8(%esp),%edx	# U	Load nh
	movl 12(%esp),%eax	# V	Load nl
	movl 4(%esp),%ecx	# U	Load q
	divl 16(%esp)		# NP
	movl %eax,(%ecx)	# U	Store quotient
	movl %edx,%eax		# V	Return remainder
	ret

## Multi-word by one-word remainder.
## This speeds up key generation. It's not worth unrolling;
## using 32-bit divides is enough of a speedup.
##
## The modulus (in %ecx) is often 16 bits. Given that the dividend words
## are 32 bits, the chance of saving the first divide (because the high
## word of the dividend is already less than the modulus) is low enough
## that it's not worth the cycles to test for it.
##
## unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
##                                     4             8            12
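##
## For reference, a minimal C sketch of the equivalent computation
## (an illustration inferred from the code below, assuming a 64-bit
## BNWORD64 type; not part of the original file):
##
##   unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
##   {
##           BNWORD64 rem = 0;
##
##           while (len--)
##                   rem = ((rem << 32) | n[len]) % d;
##           return (unsigned)rem;
##   }
##
## Each remainder is less than d, so the high word fed to divl is
## always less than the divisor and the divide cannot fault.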
	.align	align16
lbnModQ_32:
_lbnModQ_32:
	movl 4(%esp),%eax	# U	Load n
	pushl %ebp		# V
	movl 12(%esp),%ebp	# U	Load len
	pushl %esi		# V
	leal -4(%eax,%ebp,4),%esi	# U
	movl 20(%esp),%ecx	# V	Load d
	xorl %edx,%edx		# U	Clear MSW for first divide
modq32_loop:
	movl (%esi),%eax	# U
	subl $4,%esi		# V
	divl %ecx		# NP
	decl %ebp		# U
	jnz modq32_loop		# V

	popl %esi		# U
	movl %edx,%eax		# V
	popl %ebp		# U
	ret			# NP