415 lines
9.1 KiB
NASM
415 lines
9.1 KiB
NASM
;;; Copyright (c) 1995, Colin Plumb.
|
|
;;; For licensing and other legal details, see the file legal.c.
|
|
;;;
|
|
;;; Assembly primitives for bignum library, 80386 family, 32-bit code.
|
|
;;;
|
|
;;; Several primitives are included here. Only lbnMulAdd1 is *really*
|
|
;;; critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
|
|
;;; easy to write as well, so they are included here as well.
|
|
;;; lbnDiv21 and lbnModQ are so easy to write that they're included, too.
|
|
;;;
|
|
;;; All functions here are for 32-bit flat mode. I.e. near code and
|
|
;;; near data, although the near offsets are 32 bits.
|
|
;;;
|
|
;;; The usual 80x86 calling conventions have AX, BX, CX and DX
|
|
;;; volatile, and SI, DI, SP and BP preserved across calls.
|
|
;;; This includes the "E"xtended forms of all of those registers
|
|
;;;
|
|
;;; However, just to be confusing, recent 32-bit DOS compilers have
|
|
;;; quietly changed that to require EBX preserved across calls, too.
|
|
;;; Joy.
|
|
|
|
;; Enable the 80386 instruction set (32-bit registers and addressing modes).
.386
|
|
;_TEXT segment para public use32 'CODE' ; 16-byte aligned because 486 cares
|
|
;_TEXT ends
|
|
|
|
;; Declare the FLAT group ourselves when the assembler either does not
;; define @Version at all, or reports a version of 510 or lower.
;; NOTE(review): presumably newer assembler versions predefine FLAT - confirm.
ifdef @Version
|
|
if @Version le 510
|
|
FLAT group _TEXT
|
|
endif
|
|
else
|
|
FLAT group _TEXT
|
|
endif
|
|
;; cs, ds and ss are all assumed to address the FLAT group.
assume cs:FLAT, ds:FLAT, ss:FLAT
|
|
;; Open the 32-bit, paragraph-aligned code segment that holds the routines below.
_TEXT segment para public use32 'CODE' ; 16-byte aligned because 486 cares
|
|
|
|
;; Symbols made visible to other modules.
public _lbnMulN1_32
|
|
public _lbnMulAdd1_32
|
|
public _lbnMulSub1_32
|
|
public _lbnDiv21_32
|
|
public _lbnModQ_32
|
|
|
|
;; Register usage:
|
|
;; eax - low half of product
|
|
;; ebx - carry to next iteration
|
|
;; ecx - multiplier (k)
|
|
;; edx - high half of product
|
|
;; esi - source pointer
|
|
;; edi - dest pointer
|
|
;; ebp - loop counter
|
|
;;
|
|
;; Stack frame:
|
|
;; +--------+ esp+20 esp+24 esp+28 esp+32 esp+36
|
|
;; | k |
|
|
;; +--------+ esp+16 esp+20 esp+24 esp+28 esp+32
|
|
;; | len |
|
|
;; +--------+ esp+12 esp+16 esp+20 esp+24 esp+28
|
|
;; | in |
|
|
;; +--------+ esp+8 esp+12 esp+16 esp+20 esp+24
|
|
;; | out |
|
|
;; +--------+ esp+4 esp+8 esp+12 esp+16 esp+20
|
|
;; | return |
|
|
;; +--------+ esp esp+4 esp+8 esp+12 esp+16
|
|
;; | esi |
|
|
;; +--------+ esp esp+4 esp+8 esp+12
|
|
;; | ebp |
|
|
;; +--------+ esp esp+4 esp+8
|
|
;; | ebx |
|
|
;; +--------+ esp esp+4
|
|
;; | edi |
|
|
;; +--------+ esp
|
|
|
|
align 16
|
|
;; _lbnMulN1_32:  out[0..len] = in[0..len-1] * k
;;
;; Arguments are read from the stack in the order out, in, len, k (see the
;; stack frame diagram above).  len+1 words are written to out: one low
;; product word per input word, then the carry left over after the last
;; multiply.  in[0] is fetched before len is tested, so len must be >= 1.
;;
;; esi, ebp, ebx and edi are pushed on entry and popped on exit; eax, ecx
;; and edx are overwritten.
;;
;; The per-word step is unrolled four times.  ((len-1)*4) AND 12 is used
;; both to pre-advance esi/edi and to index mn1_entry_tab, which picks the
;; place in the unrolled body where the first pass begins.
_lbnMulN1_32    proc    near

        push    esi                     ; U
        mov     esi,[esp+12]            ; V  esi -> in
        push    ebp                     ; U
        mov     ebp,[esp+20]            ; V  ebp = len
        push    ebx                     ; U
        mov     ecx,[esp+28]            ; V  ecx = k
        push    edi                     ; U
        mov     edi,[esp+20]            ; V  edi -> out

;; Word 0 is handled here, ahead of the loop: there is no carry to add yet.
        mov     eax,[esi]               ; U
        lea     ebx,[ebp*4-4]           ; V  ebx = (len-1)*4
        mul     ecx                     ; NP edx:eax = in[0] * k
        mov     [edi],eax               ; U  out[0] = low half
        and     ebx,12                  ; V  ebx = ((len-1) mod 4) * 4

        add     esi,ebx                 ; U  bias both pointers by that amount
        add     edi,ebx                 ; V

        jmp     DWORD PTR mn1_entry_tab[ebx]    ; NP enter the unrolled body

        align   4
mn1_entry_tab:
        dd      mn1_rem0
        dd      mn1_rem1
        dd      mn1_rem2
        dd      mn1_rem3

        nop
        align   8
        nop
        nop
        nop                             ; padding so the loop head is nicely aligned

mn1_rem0:
        sub     ebp,4                   ; U
        jbe     SHORT mn1_finish        ; V  len was 1: only the carry is left

mn1_quad:
        mov     eax,[esi+4]             ; U
        mov     ebx,edx                 ; V  previous high half is the carry in
        add     esi,16                  ; U  step both pointers one full pass
        add     edi,16                  ; V
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  low half + carry in
        adc     edx,0                   ; U  overflow goes into the high half
        mov     [edi-12],eax            ; V
mn1_rem3:
        mov     eax,[esi-8]             ; U
        mov     ebx,edx                 ; V  previous high half is the carry in
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  low half + carry in
        adc     edx,0                   ; U
        mov     [edi-8],eax             ; V
mn1_rem2:
        mov     eax,[esi-4]             ; U
        mov     ebx,edx                 ; V  previous high half is the carry in
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  low half + carry in
        adc     edx,0                   ; U
        mov     [edi-4],eax             ; V
mn1_rem1:
        mov     eax,[esi]               ; U
        mov     ebx,edx                 ; V  previous high half is the carry in
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  low half + carry in
        adc     edx,0                   ; U
        mov     [edi],eax               ; V

        sub     ebp,4                   ; U  count down by one pass
        ja      SHORT mn1_quad          ; V  more words remain

mn1_finish:
        mov     [edi+4],edx             ; U  final carry, one word past the last product
        pop     edi                     ; V
        pop     ebx                     ; U
        pop     ebp                     ; V
        pop     esi                     ; U
        ret                             ; NP
_lbnMulN1_32    endp
|
|
|
|
|
|
align 16
|
|
;; _lbnMulAdd1_32:  out[0..len-1] += in[0..len-1] * k
;;
;; Arguments are read from the stack in the order out, in, len, k.
;; Each product word plus the running carry is added into the matching
;; word of out; the carry out of the top word is returned in eax.
;; in[0] and out[0] are processed before len is tested, so len must be >= 1.
;;
;; esi, edi, ebp and ebx are pushed on entry and popped on exit; ecx and
;; edx are overwritten.
;;
;; The per-word step is unrolled four times.  ((len-1)*4) AND 12 is used
;; both to pre-advance esi/edi and to index ma1_entry_tab, which picks the
;; place in the unrolled body where the first pass begins.
_lbnMulAdd1_32  proc    near

        push    esi                     ; U
        mov     esi,[esp+12]            ; V  esi -> in
        push    edi                     ; U
        mov     edi,[esp+12]            ; V  edi -> out
        push    ebp                     ; U
        mov     ebp,[esp+24]            ; V  ebp = len
        push    ebx                     ; U
        mov     ecx,[esp+32]            ; V  ecx = k

;; Word 0 is handled here, ahead of the loop: there is no carry to add yet.
        mov     eax,[esi]               ; U
        mov     ebx,[edi]               ; V  ebx = out[0]
        mul     ecx                     ; NP edx:eax = in[0] * k
        add     ebx,eax                 ; U  out[0] + low half
        lea     eax,[ebp*4-4]           ; V  eax = (len-1)*4 (lea leaves CF alone)
        adc     edx,0                   ; U  carry from the add goes into the high half
        and     eax,12                  ; V  eax = ((len-1) mod 4) * 4
        mov     [edi],ebx               ; U

        add     esi,eax                 ; V  bias both pointers by that amount
        add     edi,eax                 ; U

        jmp     DWORD PTR ma1_entry_tab[eax]    ; NP enter the unrolled body

        align   4
ma1_entry_tab:
        dd      ma1_rem0
        dd      ma1_rem1
        dd      ma1_rem2
        dd      ma1_rem3

        nop
        align   8
        nop
        nop
        nop                             ; padding so the loop head is properly aligned

ma1_rem0:
        sub     ebp,4                   ; U
        jbe     SHORT ma1_finish        ; V  len was 1: only the carry is left

ma1_quad:
        mov     eax,[esi+4]             ; U
        mov     ebx,edx                 ; V  previous high half is the carry in
        add     esi,16                  ; U  step both pointers one full pass
        add     edi,16                  ; V
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  low half + carry in
        mov     ebx,[edi-12]            ; V  fetch the word being accumulated into
        adc     edx,0                   ; U  overflow of low half + carry in
        add     ebx,eax                 ; V
        adc     edx,0                   ; U  overflow of the accumulate
        mov     [edi-12],ebx            ; V
ma1_rem3:
        mov     eax,[esi-8]             ; U
        mov     ebx,edx                 ; V  previous high half is the carry in
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  low half + carry in
        mov     ebx,[edi-8]             ; V  fetch the word being accumulated into
        adc     edx,0                   ; U
        add     ebx,eax                 ; V
        adc     edx,0                   ; U
        mov     [edi-8],ebx             ; V
ma1_rem2:
        mov     eax,[esi-4]             ; U
        mov     ebx,edx                 ; V  previous high half is the carry in
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  low half + carry in
        mov     ebx,[edi-4]             ; V  fetch the word being accumulated into
        adc     edx,0                   ; U
        add     ebx,eax                 ; V
        adc     edx,0                   ; U
        mov     [edi-4],ebx             ; V
ma1_rem1:
        mov     eax,[esi]               ; U
        mov     ebx,edx                 ; V  previous high half is the carry in
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  low half + carry in
        mov     ebx,[edi]               ; V  fetch the word being accumulated into
        adc     edx,0                   ; U
        add     ebx,eax                 ; V
        adc     edx,0                   ; U
        mov     [edi],ebx               ; V

        sub     ebp,4                   ; U  count down by one pass
        ja      SHORT ma1_quad          ; V  more words remain

ma1_finish:
        pop     ebx                     ; U
        pop     ebp                     ; V
        mov     eax,edx                 ; U  return the final carry word
        pop     edi                     ; V
        pop     esi                     ; U
        ret                             ; NP
_lbnMulAdd1_32  endp
|
|
|
|
|
|
align 16
|
|
;; _lbnMulSub1_32:  out[0..len-1] -= in[0..len-1] * k
;;
;; Arguments are read from the stack in the order out, in, len, k.
;; Each product word plus the running carry is subtracted from the matching
;; word of out; the borrow out of the top word is returned in eax.
;; in[0] and out[0] are processed before len is tested, so len must be >= 1.
;;
;; esi, edi, ebp and ebx are pushed on entry and popped on exit; ecx and
;; edx are overwritten.  Exactly four registers are pushed, matching the
;; four pops at ms32_done; the argument offsets below depend on that count.
;;
;; The per-word step is unrolled four times.  ((len-1)*4) AND 12 is used
;; both to pre-advance esi/edi and to index ms32_jumptable, which picks the
;; place in the unrolled body where the first pass begins.
_lbnMulSub1_32  proc    near
        push    esi                     ; U
        mov     esi,[esp+12]            ; V  load in  (1 register pushed)
        push    edi                     ; U
        mov     edi,[esp+12]            ; V  load out (2 registers pushed)
        push    ebp                     ; U
        mov     ebp,[esp+24]            ; V  load len (3 registers pushed)
        push    ebx                     ; U
        mov     ecx,[esp+32]            ; V  load k   (4 registers pushed)

;; First multiply step has no carry in.
        mov     eax,[esi]               ; U
        mov     ebx,[edi]               ; V  ebx = out[0]
        mul     ecx                     ; NP first multiply
        sub     ebx,eax                 ; U  out[0] - low half, CF = borrow
        lea     eax,[ebp*4-4]           ; V  loop unrolling (lea leaves CF alone)
        adc     edx,0                   ; U  borrow goes into the high half
        and     eax,12                  ; V  loop unrolling
        mov     [edi],ebx               ; U

        add     esi,eax                 ; V  loop unrolling
        add     edi,eax                 ; U  loop unrolling

        jmp     DWORD PTR ms32_jumptable[eax]   ; NP loop unrolling

        align   4
ms32_jumptable:
        dd      ms32_case0
        dd      ms32_case1
        dd      ms32_case2
        dd      ms32_case3

        nop
        align   8
        nop
        nop
        nop                             ; padding so the loop head is aligned

ms32_case0:
        sub     ebp,4                   ; U
        jbe     SHORT ms32_done         ; V  len was 1: only the borrow is left

ms32_loop:
        mov     eax,[esi+4]             ; U
        mov     ebx,edx                 ; V  Remember carry for later
        add     esi,16                  ; U
        add     edi,16                  ; V
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Add carry in from previous word
        mov     ebx,[edi-12]            ; V  fetch the word being subtracted from
        adc     edx,0                   ; U  overflow of low half + carry in
        sub     ebx,eax                 ; V
        adc     edx,0                   ; U  borrow of the subtract
        mov     [edi-12],ebx            ; V
ms32_case3:
        mov     eax,[esi-8]             ; U
        mov     ebx,edx                 ; V  Remember carry for later
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Add carry in from previous word
        mov     ebx,[edi-8]             ; V
        adc     edx,0                   ; U
        sub     ebx,eax                 ; V
        adc     edx,0                   ; U
        mov     [edi-8],ebx             ; V
ms32_case2:
        mov     eax,[esi-4]             ; U
        mov     ebx,edx                 ; V  Remember carry for later
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Add carry in from previous word
        mov     ebx,[edi-4]             ; V
        adc     edx,0                   ; U
        sub     ebx,eax                 ; V
        adc     edx,0                   ; U
        mov     [edi-4],ebx             ; V
ms32_case1:
        mov     eax,[esi]               ; U
        mov     ebx,edx                 ; V  Remember carry for later
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Add carry in from previous word
        mov     ebx,[edi]               ; V
        adc     edx,0                   ; U
        sub     ebx,eax                 ; V
        adc     edx,0                   ; U
        mov     [edi],ebx               ; V

        sub     ebp,4                   ; U
        ja      SHORT ms32_loop         ; V

ms32_done:
        pop     ebx                     ; U
        pop     ebp                     ; V
        mov     eax,edx                 ; U  return the final borrow word
        pop     edi                     ; V
        pop     esi                     ; U
        ret                             ; NP
_lbnMulSub1_32  endp
|
|
|
|
|
|
|
|
;; Two-word by one-word divide. Stores quotient, returns remainder.
|
|
;; BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
|
|
;; 4 8 12 16
|
|
align 4
|
|
;; _lbnDiv21_32: divide the two-word value nh:nl by d.
;; The quotient is stored through q and the remainder is returned in eax.
;; No registers are saved; eax, ecx and edx are overwritten.
_lbnDiv21_32    proc    near
        mov     ecx,[esp+4]             ; U  ecx -> q
        mov     edx,[esp+8]             ; V  edx = nh (high half of the dividend)
        mov     eax,[esp+12]            ; U  eax = nl (low half of the dividend)
        div     DWORD PTR [esp+16]      ; NP eax = edx:eax / d, edx = edx:eax mod d
        mov     [ecx],eax               ; U  *q = quotient
        mov     eax,edx                 ; V  hand back the remainder
        ret
_lbnDiv21_32    endp
|
|
|
|
;; Multi-word by one-word remainder.
|
|
;; This speeds up key generation. It's not worth unrolling and so on;
|
|
;; using 32-bit divides is enough of a speedup.
|
|
;;
|
|
;; The modulus (in ecx) is often 16 bits. Given that the dividend is 32
|
|
;; bits, the chances of saving the first divide because the high word of the
|
|
;; dividend is less than the modulus are low enough it's not worth taking
|
|
;; the cycles to test for it.
|
|
;;
|
|
;; unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
|
|
;; 4 8 12
|
|
align 4
|
|
;; _lbnModQ_32: remainder of the len-word number at n divided by d.
;; Stack arguments: n, len, d.  Words are consumed from n[len-1] down to
;; n[0], each divide taking the previous remainder as its high half.
;; The count is tested only after a divide, so len must be >= 1.
;; ebp and esi are saved and restored; ecx and edx are overwritten and
;; the remainder comes back in eax.
_lbnModQ_32     proc    near
        mov     eax,[esp+4]             ; U  eax -> n
        push    ebp                     ; V
        mov     ebp,[esp+12]            ; U  ebp = len, the number of words to go
        push    esi                     ; V
        lea     esi,[ebp*4+eax-4]       ; U  esi -> n[len-1], the top word
        mov     ecx,[esp+20]            ; V  ecx = d
        xor     edx,edx                 ; U  no remainder yet before the first divide
modq_next_word:
        mov     eax,[esi]               ; U  next word becomes the low half
        sub     esi,4                   ; V  move down one word
        div     ecx                     ; NP edx = edx:eax mod ecx
        dec     ebp                     ; U
        jnz     SHORT modq_next_word    ; V  until every word is consumed

        pop     esi                     ; U
        mov     eax,edx                 ; V  remainder is the result
        pop     ebp                     ; U
        ret                             ; NP
_lbnModQ_32     endp
|
|
|
|
_TEXT ends                              ; close the code segment opened by "_TEXT segment" above

end
|