1039 lines
28 KiB
NASM
1039 lines
28 KiB
NASM
;;; Copyright (c) 1995, Colin Plumb.
|
|
;;; For licensing and other legal details, see the file legal.c.
|
|
;;;
|
|
;;; Assembly primitives for bignum library, 80x86 family.
|
|
;;;
|
|
;;; Several primitives are included here. Only lbnMulAdd1 is *really*
|
|
;;; critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
|
|
;;; easy to write as well, so they are included here as well.
|
|
;;; lbnDiv21 and lbnModQ are so easy to write that they're included, too.
|
|
;;;
|
|
;;; All functions here are for large code, large data.
|
|
;;; All use standard "cdecl" calling convention: arguments pushed on the
|
|
;;; stack (ss:sp) right to left (the leftmost argument at the lowest address)
|
|
;;; and popped by the caller, return values in ax or dx:ax, and register
|
|
;;; usage as follows:
|
|
;;;
|
|
;;; Callee-save (preserved by callee if needed):
|
|
;;; ss, esp, cs, eip, ds, esi, edi, ebp, high byte of FLAGS except DF,
|
|
;;; all other registers (CRx, DRx, TRx, IDT, GDT, LDT, TR, etc.).
|
|
;;; Caller-save (may be corrupted by callee):
|
|
;;; es, eax, ebx, ecx, edx, low byte of flags (SF, ZF, AF, PF, CF)
|
|
;;;
|
|
;;; The direction flag (DF) is either preserved or cleared.
|
|
;;; I'm not sure what the calling convention is for fs and gs. This
|
|
;;; code never alters them.
|
|
|
|
;; Not all of this code has to be '386 code, but STUPID FUCKING MASM (5.0)
|
|
;; gives an error if you change in the middle of a segment. Rather than
|
|
;; fight the thing, just enable '386 instructions everywhere. (And lose
|
|
;; the error checking.)
|
|
.386                                    ; Accept '386 instructions everywhere in the file

;; The one code segment: 16-bit, public, paragraph (16-byte) aligned
;; because the '486 cares about code alignment.
_TEXT   SEGMENT PARA PUBLIC USE16 'CODE'
        ASSUME  cs:_TEXT

;; Entry points working on 16-bit words
        PUBLIC  _lbnMulN1_16, _lbnMulAdd1_16, _lbnMulSub1_16
        PUBLIC  _lbnDiv21_16, _lbnModQ_16

;; Entry points working on 32-bit words
        PUBLIC  _lbnMulN1_32, _lbnMulAdd1_32, _lbnMulSub1_32
        PUBLIC  _lbnDiv21_32, _lbnModQ_32

;; Processor detection helper
        PUBLIC  _not386
|
|
|
|
|
|
;; Prototype:
|
|
;; BNWORD16
|
|
;; lbnMulAdd1_16(BNWORD16 *out, BNWORD16 *in, unsigned len, BNWORD16 k)
|
|
;;
|
|
;; Multiply len words of "in" by k and add to len words of "out";
|
|
;; return the len+1st word of carry. All pointers are to the least-
|
|
;; significant ends of the appropriate arrays. len is guaranteed > 0.
|
|
;;
|
|
;; This 16-bit code is optimized for an 8086/80286. It will not be run
|
|
;; on 32-bit processors except for debugging during development.
|
|
;;
|
|
;; NOTE that it may be possible to assume that the direction flag is clear
|
|
;; on entry; this would avoid the need for the cld instructions. However,
|
|
;; the Microsoft C libraries require that the direction flag be clear.
|
|
;; Thus, lbnModQ_16 clears it before returning.
|
|
;;
|
|
;; Stack frame:
|
|
;; +--------+ bp+18
|
|
;; | k |
|
|
;; +--------+ bp+16
|
|
;; | len |
|
|
;; +--------+ bp+14
|
|
;; | |
|
|
;; +- in -+
|
|
;; | |
|
|
;; +--------+ bp+10
|
|
;; | |
|
|
;; +- out -+
|
|
;; | |
|
|
;; +--------+ bp+6
|
|
;; | |
|
|
;; +-return-+
|
|
;; | |
|
|
;; +--------+ bp+2
|
|
;; | old bp |
|
|
;; +--------+ bp
|
|
;;
|
|
;; Register usage for lbnMulN1_16:
|
|
;; ds:[si] in
|
|
;; es:[di] out
|
|
;; bp k
|
|
;; cx loop counter (len/4)
|
|
;; dx,ax high,low parts of product
|
|
;; bx carry from previous multiply iteration
|
|
;;
|
|
;; Register usage for lbnMulAdd1_16 and lbnMulSub1_16:
|
|
;; ds:[si] in
|
|
;; es:[bx+si] out
|
|
;; bp k
|
|
;; cx loop counter (len/4)
|
|
;; dx,ax high,low parts of product
|
|
;; di carry from previous multiply iteration
|
|
;;
|
|
;; The reason for the difference is that straight mul can use stosw, but
|
|
;; the multiply and add or multiply and subtract add the result in, so
|
|
;; they have to reference es:[di] to add it in.
|
|
;;
|
|
;; The options are either "add ax,es:[di]; stosw" or "add es:[di],ax;
|
|
;; add di,2"; both take 10 cycles on an 80286, 27 on an 8086 and 35 on
|
|
;; an 8088 although the former is preferred since it's one byte smaller.
|
|
;; However, using [bx+si] is even faster; "add es:[bx+si],ax" takes
|
|
;; 7 cycles on an 80286, 25 on an 8086 and 33 on an 8088, as well as
|
|
;; being the smallest. (Of course, stosw, at 3 on an 80286, 11 on an
|
|
;; 8086 and 15 on an 8088 wins easily in the straight multiply case over
|
|
;; mov es:[bx+si],ax, which takes 3/18/22 cycles and is larger to boot.)
|
|
;;
|
|
;; Most of these register assignments are driven by the 8086's instruction
|
|
;; set. The only really practical variation would be to put the multiplier
|
|
;; k into bx or di and use bp for carry, but if someone can make a faster
|
|
;; Duff's device using a lookup table, bx and di are useful because indexing
|
|
;; off them is more flexible than bp.
|
|
;;
|
|
;; Overview of code:
|
|
;;
|
|
;; len is guaranteed to be at least 1, so do the first multiply (with no
|
|
;; carry in) unconditionally. Then go to a main loop unrolled 4 times,
|
|
;; jumping into the middle using a variant of Duff's device.
|
|
;;
|
|
;; The loop is constructed using the loop instruction, which does
|
|
;; "} while (--cnt)". This means that we have to divide the count
|
|
;; by 4, and increment it so it doesn't start at 0. To gain a little
|
|
;; bit more efficiency, we actually increment the count by 2, so the
|
|
;; minimum possible value is 3, which will be shifted down to produce 0.
|
|
;; usually in Duff's device, if the number of iterations is a multiple
|
|
;; of the unrolling factor, you branch to just before the loop conditional
|
|
;; and let it handle the case of 0. Here, we have a special test for 0
|
|
;; at the head of the loop and fall through into the top of the loop
|
|
;; if it passes.
|
|
;;
|
|
;; Basically, with STEP being a multiply step, it's:
|
|
;;
|
|
;; STEP;
|
|
;; count += 2;
|
|
;; mod4 = count % 4;
|
|
;; count /= 4;
|
|
;; switch(mod4) {
|
|
;; case 3:
|
|
;; if (count) {
|
|
;; do {
|
|
;; STEP;
|
|
;; case 2:
|
|
;; STEP;
|
|
;; case 1:
|
|
;; STEP;
|
|
;; case 0:
|
|
;; STEP;
|
|
;; } while (--count);
|
|
;; }
|
|
;; }
|
|
;;
|
|
;; The switch() is actually done by two levels of branch instructions
|
|
;; rather than a lookup table.
|
|
|
|
;; One inner step of lbnMulN1_16: out word = low half of (in word * k + carry).
;;   in:   ds:si -> next word of in, es:di -> next word of out,
;;         bp = k, dx = carry word from the previous step
;;   out:  dx = new carry word; si and di advanced by 2; ax, bx trashed
MULN16_STEP MACRO
        lodsw                           ; ax = *in++
        mov     bx,dx                   ; Park incoming carry; mul overwrites dx
        mul     bp                      ; dx:ax = ax * k
        add     ax,bx                   ; Low half += incoming carry
        adc     dx,0                    ; Ripple into the high half
        stosw                           ; *out++ = low half
        ENDM

;; lbnMulN1_16(out, in, len, k): multiply the len-word number at "in" by
;; the single word k and store the len+1 word product at "out".
;; Far cdecl frame: out = [bp+6] (far ptr), in = [bp+10] (far ptr),
;; len = [bp+14], k = [bp+16].  len must be at least 1.
;; Saves and restores bp, ds, si, di; returns with DF clear.
_lbnMulN1_16 PROC FAR

        push    bp
        mov     bp,sp
        push    ds
        push    si
        push    di
        cld                             ; lodsw/stosw must step upward

        les     di,[bp+6]               ; es:di = out
        lds     si,[bp+10]              ; ds:si = in
        mov     cx,[bp+14]              ; cx = len
        mov     bp,[bp+16]              ; bp = k (frame no longer needed)

        ;; Word 0 has no carry coming in.
        lodsw
        mul     bp
        stosw

        ;; Dispatch into the 4x unrolled loop (Duff's device).  With one
        ;; word already done, (len+2) mod 4 selects the entry point and
        ;; (len+2)/4 is the trip count for the "loop" instruction.
        ;; Two levels of shift-and-branch are used instead of a jump
        ;; table because that is faster on an 8086 and '286:
        ;;   8086:  jump table 44 cycles; this 27/29/31/41
        ;;   80286: jump table 25 cycles; this 17/17/20/22
        shr     cx,1                    ; CF = low bit of len
        jc      SHORT muln16_odd

        inc     cx                      ; len is even
        shr     cx,1
        jc      SHORT muln16_rem3       ; len mod 4 == 0: three steps this pass
        jmp     SHORT muln16_rem1       ; len mod 4 == 2: one step this pass

        nop                             ; Never executed; aligns the loop
muln16_odd:
        inc     cx                      ; len is odd
        shr     cx,1
        jnc     SHORT muln16_rem2       ; len mod 4 == 3: two steps this pass
        jz      SHORT muln16_finish     ; len == 1: nothing left to do

muln16_rem4:                            ; Top of loop: four steps this pass
        MULN16_STEP
muln16_rem3:
        MULN16_STEP
muln16_rem2:
        MULN16_STEP
muln16_rem1:
        MULN16_STEP

        loop    muln16_rem4             ; while (--cx)

muln16_finish:
        mov     ax,dx                   ; Last carry is the top word of out
        stosw
        pop     di
        pop     si
        pop     ds
        pop     bp
        ret

_lbnMulN1_16 ENDP
|
|
|
|
|
|
;; One inner step of lbnMulAdd1_16: out word += in word * k + carry.
;;   in:   ds:si -> next word of in, bp = k, dx = carry word,
;;         bx = out - in - 2 (offsets), so that once lodsw has stepped si
;;         past the word it read, es:[bx+si] is the matching word of out
;;   out:  dx = new carry word; si advanced by 2; ax, di trashed
MULADD16_STEP MACRO
        lodsw                           ; ax = *in++
        mov     di,dx                   ; Park incoming carry; mul overwrites dx
        mul     bp                      ; dx:ax = ax * k
        add     ax,di                   ; Low half += incoming carry
        adc     dx,0
        add     es:[bx+si],ax           ; Accumulate into out
        adc     dx,0                    ; Carry out of that add joins the carry word
        ENDM

;; lbnMulAdd1_16(out, in, len, k): add k times the len-word number at
;; "in" to the len words at "out"; return the final carry word in ax.
;; Far cdecl frame: out = [bp+6] (far ptr), in = [bp+10] (far ptr),
;; len = [bp+14], k = [bp+16].  len must be at least 1.
;; Saves and restores bp, ds, si, di; returns with DF clear.
        ALIGN   2
_lbnMulAdd1_16 PROC FAR

        push    bp
        mov     bp,sp
        push    ds
        push    si
        push    di
        cld                             ; lodsw must step upward

        les     bx,[bp+6]               ; es:bx = out
        lds     si,[bp+10]              ; ds:si = in
        mov     cx,[bp+14]              ; cx = len
        mov     bp,[bp+16]              ; bp = k (frame no longer needed)

        ;; Word 0 has no carry coming in and is addressed through bx alone.
        lodsw
        mul     bp
        add     es:[bx],ax
        adc     dx,0
        sub     bx,si                   ; From here on out is es:[bx+si]

        ;; Dispatch into the 4x unrolled loop (Duff's device): (len+2) mod 4
        ;; selects the entry point, (len+2)/4 is the trip count.
        ;; Shift-and-branch beats a jump table on an 8086 and '286:
        ;;   8086:  jump table 44 cycles; this 27/29/31/41
        ;;   80286: jump table 25 cycles; this 17/17/20/22
        shr     cx,1                    ; CF = low bit of len
        jc      SHORT mula16_odd

        inc     cx                      ; len is even
        shr     cx,1
        jc      SHORT mula16_rem3       ; len mod 4 == 0: three steps this pass
        jmp     SHORT mula16_rem1       ; len mod 4 == 2: one step this pass

mula16_odd:
        inc     cx                      ; len is odd
        shr     cx,1
        jnc     SHORT mula16_rem2       ; len mod 4 == 3: two steps this pass
        jz      SHORT mula16_finish     ; len == 1: nothing left to do

mula16_rem4:                            ; Top of loop: four steps this pass
        MULADD16_STEP
mula16_rem3:
        MULADD16_STEP
mula16_rem2:
        MULADD16_STEP
mula16_rem1:
        MULADD16_STEP

        loop    mula16_rem4             ; while (--cx)

mula16_finish:
        mov     ax,dx                   ; Return the final carry word
        pop     di
        pop     si
        pop     ds
        pop     bp
        ret

_lbnMulAdd1_16 ENDP
|
|
|
|
;; One inner step of lbnMulSub1_16: out word -= in word * k + carry.
;;   in:   ds:si -> next word of in, bp = k, dx = carry word,
;;         bx = out - in - 2 (offsets), so that once lodsw has stepped si
;;         past the word it read, es:[bx+si] is the matching word of out
;;   out:  dx = new carry word; si advanced by 2; ax, di trashed
MULSUB16_STEP MACRO
        lodsw                           ; ax = *in++
        mov     di,dx                   ; Park incoming carry; mul overwrites dx
        mul     bp                      ; dx:ax = ax * k
        add     ax,di                   ; Low half += incoming carry
        adc     dx,0
        sub     es:[bx+si],ax           ; Subtract from out
        adc     dx,0                    ; Borrow from that sub joins the carry word
        ENDM

;; lbnMulSub1_16(out, in, len, k): subtract k times the len-word number
;; at "in" from the len words at "out"; return the final carry (borrow)
;; word in ax.
;; Far cdecl frame: out = [bp+6] (far ptr), in = [bp+10] (far ptr),
;; len = [bp+14], k = [bp+16].  len must be at least 1.
;; Saves and restores bp, ds, si, di; returns with DF clear.
        ALIGN   2
_lbnMulSub1_16 PROC FAR

        push    bp
        mov     bp,sp
        push    ds
        push    si
        push    di
        cld                             ; lodsw must step upward

        les     bx,[bp+6]               ; es:bx = out
        lds     si,[bp+10]              ; ds:si = in
        mov     cx,[bp+14]              ; cx = len
        mov     bp,[bp+16]              ; bp = k (frame no longer needed)

        ;; Word 0 has no carry coming in and is addressed through bx alone.
        lodsw
        mul     bp
        sub     es:[bx],ax
        adc     dx,0                    ; Count the borrow
        sub     bx,si                   ; From here on out is es:[bx+si]

        ;; Dispatch into the 4x unrolled loop (Duff's device): (len+2) mod 4
        ;; selects the entry point, (len+2)/4 is the trip count.
        ;; Shift-and-branch beats a jump table on an 8086 and '286:
        ;;   8086:  jump table 44 cycles; this 27/29/31/41
        ;;   80286: jump table 25 cycles; this 17/17/20/22
        shr     cx,1                    ; CF = low bit of len
        jc      SHORT muls16_odd

        inc     cx                      ; len is even
        shr     cx,1
        jc      SHORT muls16_rem3       ; len mod 4 == 0: three steps this pass
        jmp     SHORT muls16_rem1       ; len mod 4 == 2: one step this pass

muls16_odd:
        inc     cx                      ; len is odd
        shr     cx,1
        jnc     SHORT muls16_rem2       ; len mod 4 == 3: two steps this pass
        jz      SHORT muls16_finish     ; len == 1: nothing left to do

muls16_rem4:                            ; Top of loop: four steps this pass
        MULSUB16_STEP
muls16_rem3:
        MULSUB16_STEP
muls16_rem2:
        MULSUB16_STEP
muls16_rem1:
        MULSUB16_STEP

        loop    muls16_rem4             ; while (--cx)

muls16_finish:
        mov     ax,dx                   ; Return the final carry (borrow) word
        pop     di
        pop     si
        pop     ds
        pop     bp
        ret

_lbnMulSub1_16 ENDP
|
|
|
|
;; Two-word by one-word divide. Stores quotient, returns remainder.
|
|
;; BNWORD16 lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
|
|
;; 4 8 10 12
|
|
;; lbnDiv21_16(q, nh, nl, d): divide the two-word value nh:nl by d,
;; store the one-word quotient through the far pointer q and return the
;; remainder in ax.
;; bp is parked in cx rather than pushed, so the arguments sit two bytes
;; lower than in the other routines: q = [bp+4], nh = [bp+8],
;; nl = [bp+10], d = [bp+12].
        ALIGN   2
_lbnDiv21_16 PROC FAR
        mov     cx,bp                   ; Save caller's bp without touching the stack
        mov     bp,sp
        mov     ax,[bp+10]              ; ax = nl, low word of dividend
        mov     dx,[bp+8]               ; dx = nh, high word of dividend
        div     WORD PTR [bp+12]        ; ax = quotient, dx = remainder
        les     bx,[bp+4]               ; es:bx = q
        mov     es:[bx],ax              ; *q = quotient
        mov     ax,dx                   ; Return value = remainder
        mov     bp,cx                   ; Restore caller's bp
        ret

        nop                             ; Never executed; aligns the loop in lbnModQ_16

_lbnDiv21_16 ENDP
|
|
|
|
;; Multi-word by one-word remainder.
|
|
;; BNWORD16 lbnModQ_16(BNWORD16 *q, unsigned len, unsigned d)
|
|
;; 6 10 12
|
|
;; lbnModQ_16(q, len, d): return in ax the remainder of the len-word
;; number at q divided by the single word d.
;; Far cdecl frame: q = [bp+6] (far ptr), len = [bp+10], d = [bp+12].
;; The number is walked from most to least significant word with DF set;
;; DF is cleared again before returning.  si is kept in bx (caller-save)
;; instead of being pushed; bp and ds are saved on the stack.
_lbnModQ_16 PROC FAR
        push    bp
        mov     bp,sp
        push    ds
        mov     bx,si                   ; Preserve si in a scratch register
        mov     cx,[bp+10]              ; cx = len
        lds     si,[bp+6]               ; ds:si = q
        std                             ; lodsw steps downward: MSW to LSW
        add     si,cx
        add     si,cx                   ; si = q + 2*len
        mov     bp,[bp+12]              ; bp = d (frame no longer needed)
        xor     dx,dx                   ; High word of the first dividend is 0
        sub     si,2                    ; si -> most significant word

        lodsw                           ; ax = most significant word

        cmp     ax,bp
        jnc     SHORT mq16_divide       ; MSW >= d: it has to be divided
        mov     dx,ax                   ; MSW < d: it is its own remainder
        dec     cx                      ; One word consumed without a divide
        jz      SHORT mq16_finish       ; That was the only word

mq16_next:
        lodsw                           ; ax = next lower word
mq16_divide:
        div     bp                      ; dx = (dx:ax) mod d
        loop    mq16_next
mq16_finish:
        pop     ds
        mov     ax,dx                   ; Return value = remainder
        pop     bp
        mov     si,bx                   ; Restore caller's si
        cld                             ; Microsoft C's libraries expect DF clear
        ret

_lbnModQ_16 ENDP
|
|
|
|
|
|
;; Similar, but using 32-bit operations.
|
|
;;
|
|
;; The differences are that the switch() in Duff's device is done using
|
|
;; a jump table, and lods is not used because it's slower than load and
|
|
;; increment. The pointers are only updated once per loop; offset
|
|
;; addressing modes are used, since they're no slower. [di] is used
|
|
;; instead of [bx+si] because the extra increment of di takes only one
|
|
;; cycle per loop on a '486, while [bx+si] takes one extra cycle per multiply.
|
|
;;
|
|
;; The register assignments are also slightly different:
|
|
;;
|
|
;; es:[si] in
|
|
;; ds:[di] out
|
|
;; ecx k
|
|
;; bp loop counter (4*(len-1) bytes remaining, stepped down by 16 per pass)
|
|
;; edx,eax high,low parts of product
|
|
;; ebx carry word from previous multiply iteration
|
|
;;
|
|
;; The use of bp for a loop counter lets all the 32-bit values go
|
|
;; in caller-save registers, so there's no need to do any 32-bit
|
|
;; saves and restores. Using ds:di for the destination saves one
|
|
;; segment override in the lbnMulN1_32 code, since there's one more
|
|
;; store to [di] than load from es:[si].
|
|
;;
|
|
;; Given the number of 32-bit references that this code uses, optimizing
|
|
;; it for the Pentium is interesting, because the Pentium has a very
|
|
;; inefficient implementation of prefix bytes. Each prefix byte, with
|
|
;; the exception of 0x0f *>> on conditional branch instructions ONLY <<*
|
|
;; is a 1-cycle non-pairable instruction. Which has the effect of
|
|
;; forcing the instruction it's on into the U pipe. But this code uses
|
|
;; *lots* of prefix bytes, notably the 0x66 operand size override.
|
|
;;
|
|
;; For example "add [di],eax" is advised against in Intel's optimization
|
|
;; papers, because it takes 3 cycles and 2 of them are not pairable.
|
|
;; But any longer sequence would have a prefix byte on every instruction,
|
|
;; resulting in even more non-pairable cycles. Also, only two instructions
|
|
;; in the multiply kernel can go in the V pipe (the increments of si and
|
|
;; di), and they're already there, so the pairable cycles would be wasted.
|
|
;;
|
|
;; Things would be *quite* different in native 32-bit mode.
|
|
;;
|
|
;; All instructions that could go in the V pipe that aren't there are
|
|
;; marked.
|
|
;;
|
|
;; The setup code is quite intricately interleaved to get the best possible
|
|
;; performance out of a Pentium. If you want to follow the code,
|
|
;; pretend that the sections actually come in the following order:
|
|
;; 1) prologue (push registers)
|
|
;; 2) load (fetch arguments)
|
|
;; 3) first multiply
|
|
;; 4) loop unrolling
|
|
;;
|
|
;; The loop unrolling setup consists of taking the count, adjusting
|
|
;; it to account for the first multiply, and splitting it into
|
|
;; two parts: the high bits are a loop count, while the low bits are
|
|
;; used to find the right entry in the Duff's device jump table and
|
|
;; to adjust the initial data pointers.
|
|
;;
|
|
;; Known slack: There is one instruction in the prologue and one in
|
|
;; the epilogue that could go in the V pipe if I could find a U-pipe
|
|
;; instruction to pair them with, but all the U-pipe instructions
|
|
;; are already paired, so it looks difficult.
|
|
;;
|
|
;; There is a cycle of Address Generation Interlock in the lbnMulN1_32
|
|
;; code on the Pentium (not on a '486). I can't figure out how to
|
|
;; get rid of it without wasting time elsewhere. The problem is that
|
|
;; the load of bx needs to be done as soon as possible to let it
|
|
;; be set up in time for the switch(). The other problem is the
|
|
;; epilogue code which can waste time if the order of the pushed
|
|
;; registers is diddled with so that ds doesn't come between si and di.
|
|
;;
|
|
;; The increment of si after the last load is redundant, and the
|
|
;; copy of the high word of the product to the carry after the last
|
|
;; multiply is likewise unnecessary.
|
|
;;
|
|
;; In these cases, the operations were done that way in order to remove
|
|
;; cycles from the loop on the '486 and/or Pentium, even though it costs
|
|
;; a few overhead cycles on a '386.
|
|
;; The increment of si has to be done early because a load based on si
|
|
;; is the first thing in any given multiply step, and the address
|
|
;; generation interlock on the '486 and Pentium requires that a full
|
|
;; cycle (i.e. possibly two instructions on a Pentium) pass between
|
|
;; incrementing a register and using it in an address.
|
|
;; This saves one cycle per multiply on a '486 and Pentium, and costs
|
|
;; 2 cycles per call to the function on a '386 and 1 cycle on a '486.
|
|
;;
|
|
;; The carry word is copied where it is so that the decrement of the loop
|
|
;; counter happens in the V pipe. The instruction between the decrement
|
|
;; of the loop counter and the branch should be a U-pipe instruction that
|
|
;; doesn't affect the flags. Thus, the "mov" was rotated down from
|
|
;; the top of the loop to fill the slot.
|
|
;; This is a bit more marginal: it saves one cycle per loop iteration on
|
|
;; a Pentium, and costs 2 cycles per call on a '386, '486 or Pentium.
|
|
;;
|
|
;; The same logic applies to the copy of the carry and increment of si
|
|
;; before the test, in case 0, for skipping the loop entirely.
|
|
;; It makes no difference in speed if the loop is executed, but
|
|
;; incrementing si before saves an address generation interlock cycle
|
|
;; On a '486 and Pentium in the case that the loop is executed.
|
|
;; And the loop is executed more often than not.
|
|
;;
|
|
;; Given that just one multiply on a '386 takes 12 to 41 cycles (with the
|
|
;; average being very much at the high end of that) 4 cycles of additional
|
|
;; overhead per call is not a big deal.
|
|
;;
|
|
;; On a Pentium, it would actually be easier to *not* unroll the loop
|
|
;; at all, since the decrement and compare are completely hidden
|
|
;; in the V-pipe and it wouldn't cost anything to do them more often.
|
|
;; That would save the setup for the unrolling and Duff's device at the
|
|
;; beginning. But the overhead for that is pretty minor: ignoring what's
|
|
;; hidden in the V pipe, it's two cycles plus the indirect jump.
|
|
;; Not too much, and special-casing the pentium is quite a hassle.
|
|
;; (For starters, you have to detect it, and since you're probably in
|
|
;; V86 mode, without access to the EFLAGS register to test the CPUID bit.)
|
|
|
|
|
|
;; lbnMulN1_32(out, in, len, k): multiply the len-dword number at "in" by
;; the single dword k and store the len+1 dword product at "out".
;; Far cdecl frame: out = [bp+6] (far ptr), in = [bp+10] (far ptr),
;; len = [bp+14] (word), k = [bp+16] (dword).  len must be at least 1.
;; Saves and restores bp, si, ds, di.
;;
;; Instruction order is deliberate (Pentium pairing); each line is tagged
;; U, V or NP (not pairable), and "V-capable" marks a U-pipe instruction
;; that could have issued in the V pipe.
        ALIGN   16
_lbnMulN1_32 PROC FAR

        push    bp                      ; U  prologue (V-capable)
        mov     bp,sp                   ; V  prologue
        push    si                      ; U  prologue (V-capable)
        mov     bx,[bp+14]              ; U  bx = len (V-capable, but AGI)
        push    ds                      ; NP prologue
        les     si,[bp+10]              ; NP es:si = in
        mov     ecx,[bp+16]             ; U  ecx = k
        dec     bx                      ; V  dwords left after the first one
        shl     bx,2                    ; U  ...converted to bytes
        push    di                      ; V  prologue
        lds     di,[bp+6]               ; NP ds:di = out
        mov     bp,bx                   ; U  bp = 4*(len-1), loop counter (V-capable)
        and     bx,12                   ; V  bx = 4*((len-1) mod 4), dispatch index

        ;; Dword 0 has no carry coming in.  si and di are biased by bx
        ;; so the fixed displacements at each entry point line up.
        mov     eax,es:[si]             ; U  eax = in[0]
        add     si,bx                   ; V  bias the in pointer
        mul     ecx                     ; NP edx:eax = in[0] * k
        mov     [di],eax                ; U  out[0] = low half
        add     di,bx                   ; V  bias the out pointer

        ;; Duff's device dispatch through a table of 4-byte slots
        ;; (near offset + one filler word).
        jmp     WORD PTR cs:muln32_dispatch[bx] ; NP
        ALIGN   2
muln32_dispatch:
        DW      OFFSET muln32_rem0, 0
        DW      OFFSET muln32_rem1, 0
        DW      OFFSET muln32_rem2, 0
        DW      OFFSET muln32_rem3, 0, 0, 0, 0  ; Extra zero words align the loop

muln32_rem0:                            ; (len-1) mod 4 == 0
        add     si,16                   ; U  Finish biasing si (V-capable)
        test    bp,bp                   ; V  Any dwords left at all?
        mov     ebx,edx                 ; U  ebx = carry into the next step
        jbe     SHORT muln32_finish     ; V  No: len == 1, skip the loop

muln32_rem4:                            ; Top of loop: four steps this pass
        mov     eax,es:[si-12]          ; U
        add     di,16                   ; V
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        mov     [di-12],eax             ; U
muln32_rem3:
        mov     ebx,edx                 ; U  ebx = carry into this step
        mov     eax,es:[si-8]           ; U
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        mov     [di-8],eax              ; U
muln32_rem2:
        mov     ebx,edx                 ; U  ebx = carry into this step
        mov     eax,es:[si-4]           ; U
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        mov     [di-4],eax              ; U
muln32_rem1:
        mov     ebx,edx                 ; U  ebx = carry into this step
        mov     eax,es:[si]             ; U
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        add     si,16                   ; V  Advance in for the next pass
        mov     [di],eax                ; U

        sub     bp,16                   ; V  Four more dwords accounted for
        mov     ebx,edx                 ; U  ebx = carry into the next pass
        ja      muln32_rem4             ; V  Loop while the count stays above 0

muln32_finish:
        mov     [di+4],edx              ; U  Last carry is the top dword of out
        pop     di                      ; V
        pop     ds                      ; NP
        pop     si                      ; U  (V-capable)
        pop     bp                      ; V
        ret                             ; NP

_lbnMulN1_32 ENDP
|
|
|
|
|
|
;; lbnMulAdd1_32(out, in, len, k): add k times the len-dword number at
;; "in" to the len dwords at "out"; return the final carry dword in dx:ax.
;; Far cdecl frame: out = [bp+6] (far ptr), in = [bp+10] (far ptr),
;; len = [bp+14] (word), k = [bp+16] (dword).  len must be at least 1.
;; Saves and restores bp, ds, di, si.
;;
;; Instruction order is deliberate (Pentium pairing); each line is tagged
;; U, V or NP (not pairable), and "V-capable" marks a U-pipe instruction
;; that could have issued in the V pipe.
        ALIGN   16
_lbnMulAdd1_32 PROC FAR

        push    bp                      ; U  prologue (V-capable)
        mov     bp,sp                   ; V  prologue
        push    ds                      ; NP prologue

        mov     ecx,[bp+16]             ; U  ecx = k
        mov     bx,[bp+14]              ; V  bx = len
        push    di                      ; U  prologue (V-capable)
        dec     bx                      ; V  dwords left after the first one
        lds     di,[bp+6]               ; NP ds:di = out
        shl     bx,2                    ; U  ...converted to bytes
        push    si                      ; V  prologue
        les     si,[bp+10]              ; NP es:si = in

        mov     bp,bx                   ; U  bp = 4*(len-1), loop counter (V-capable)
        and     bx,12                   ; V  bx = 4*((len-1) mod 4), dispatch index

        ;; Dword 0 has no carry coming in.  si and di are biased by bx
        ;; so the fixed displacements at each entry point line up.
        mov     eax,es:[si]             ; U  eax = in[0]
        add     si,bx                   ; V  bias the in pointer
        mul     ecx                     ; NP edx:eax = in[0] * k
        add     [di],eax                ; U  out[0] += low half
        adc     edx,0                   ; U  Carry out of the add joins the carry dword
        add     di,bx                   ; V  bias the out pointer

        ;; Duff's device dispatch through a table of 4-byte slots
        ;; (near offset + one filler word).
        jmp     WORD PTR cs:mula32_dispatch[bx] ; NP
        ALIGN   2
mula32_dispatch:
        DW      OFFSET mula32_rem0, 0
        DW      OFFSET mula32_rem1, 0
        DW      OFFSET mula32_rem2, 0
        DW      OFFSET mula32_rem3, 0, 0        ; Extra zero word aligns the loop

mula32_rem0:                            ; (len-1) mod 4 == 0
        add     si,16                   ; U  Finish biasing si (V-capable)
        test    bp,bp                   ; V  Any dwords left at all?
        mov     ebx,edx                 ; U  ebx = carry into the next step
        jbe     SHORT mula32_finish     ; V  No: len == 1, skip the loop

mula32_rem4:                            ; Top of loop: four steps this pass
        mov     eax,es:[si-12]          ; U
        add     di,16                   ; V
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        add     [di-12],eax             ; U  Accumulate into out
        adc     edx,0                   ; U
mula32_rem3:
        mov     ebx,edx                 ; U  ebx = carry into this step
        mov     eax,es:[si-8]           ; U
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        add     [di-8],eax              ; U  Accumulate into out
        adc     edx,0                   ; U
mula32_rem2:
        mov     ebx,edx                 ; U  ebx = carry into this step
        mov     eax,es:[si-4]           ; U
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        add     [di-4],eax              ; U  Accumulate into out
        adc     edx,0                   ; U
mula32_rem1:
        mov     ebx,edx                 ; U  ebx = carry into this step
        mov     eax,es:[si]             ; U
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        add     si,16                   ; V  Advance in for the next pass
        add     [di],eax                ; U  Accumulate into out
        adc     edx,0                   ; U

        sub     bp,16                   ; V  Four more dwords accounted for
        mov     ebx,edx                 ; U  ebx = carry into the next pass
        ja      mula32_rem4             ; V  Loop while the count stays above 0

mula32_finish:
        pop     si                      ; U  (V-capable)
        pop     di                      ; V
        mov     ax,dx                   ; U  ax = low word of the carry (V-capable)
        pop     ds                      ; NP
        shr     edx,16                  ; U  dx = high word of the carry
        pop     bp                      ; V
        ret                             ; NP

_lbnMulAdd1_32 ENDP
|
|
|
|
|
|
;; lbnMulSub1_32(out, in, len, k): subtract k times the len-dword number
;; at "in" from the len dwords at "out"; return the final carry (borrow)
;; dword in dx:ax.
;; Far cdecl frame: out = [bp+6] (far ptr), in = [bp+10] (far ptr),
;; len = [bp+14] (word), k = [bp+16] (dword).  len must be at least 1.
;; Saves and restores bp, ds, di, si.
;;
;; Instruction order is deliberate (Pentium pairing); each line is tagged
;; U, V or NP (not pairable), and "V-capable" marks a U-pipe instruction
;; that could have issued in the V pipe.
        ALIGN   16
_lbnMulSub1_32 PROC FAR

        push    bp                      ; U  prologue (V-capable)
        mov     bp,sp                   ; V  prologue
        push    ds                      ; NP prologue

        mov     ecx,[bp+16]             ; U  ecx = k
        mov     bx,[bp+14]              ; V  bx = len
        push    di                      ; U  prologue (V-capable)
        dec     bx                      ; V  dwords left after the first one
        lds     di,[bp+6]               ; NP ds:di = out
        shl     bx,2                    ; U  ...converted to bytes
        push    si                      ; V  prologue
        les     si,[bp+10]              ; NP es:si = in

        mov     bp,bx                   ; U  bp = 4*(len-1), loop counter (V-capable)
        and     bx,12                   ; V  bx = 4*((len-1) mod 4), dispatch index

        ;; Dword 0 has no carry coming in.  si and di are biased by bx
        ;; so the fixed displacements at each entry point line up.
        mov     eax,es:[si]             ; U  eax = in[0]
        add     si,bx                   ; V  bias the in pointer
        mul     ecx                     ; NP edx:eax = in[0] * k
        sub     [di],eax                ; U  out[0] -= low half
        adc     edx,0                   ; U  Borrow from the sub joins the carry dword
        add     di,bx                   ; V  bias the out pointer

        ;; Duff's device dispatch through a table of 4-byte slots
        ;; (near offset + one filler word).
        jmp     WORD PTR cs:muls32_dispatch[bx] ; NP
        ALIGN   2
muls32_dispatch:
        DW      OFFSET muls32_rem0, 0
        DW      OFFSET muls32_rem1, 0
        DW      OFFSET muls32_rem2, 0
        DW      OFFSET muls32_rem3, 0, 0        ; Extra zero word aligns the loop

muls32_rem0:                            ; (len-1) mod 4 == 0
        add     si,16                   ; U  Finish biasing si (V-capable)
        test    bp,bp                   ; V  Any dwords left at all?
        mov     ebx,edx                 ; U  ebx = carry into the next step
        jbe     SHORT muls32_finish     ; V  No: len == 1, skip the loop

muls32_rem4:                            ; Top of loop: four steps this pass
        mov     eax,es:[si-12]          ; U
        add     di,16                   ; V
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        sub     [di-12],eax             ; U  Subtract from out
        adc     edx,0                   ; U  Count the borrow
muls32_rem3:
        mov     ebx,edx                 ; U  ebx = carry into this step
        mov     eax,es:[si-8]           ; U
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        sub     [di-8],eax              ; U  Subtract from out
        adc     edx,0                   ; U  Count the borrow
muls32_rem2:
        mov     ebx,edx                 ; U  ebx = carry into this step
        mov     eax,es:[si-4]           ; U
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        sub     [di-4],eax              ; U  Subtract from out
        adc     edx,0                   ; U  Count the borrow
muls32_rem1:
        mov     ebx,edx                 ; U  ebx = carry into this step
        mov     eax,es:[si]             ; U
        mul     ecx                     ; NP
        add     eax,ebx                 ; U  Low half += incoming carry
        adc     edx,0                   ; U
        add     si,16                   ; V  Advance in for the next pass
        sub     [di],eax                ; U  Subtract from out
        adc     edx,0                   ; U  Count the borrow

        sub     bp,16                   ; V  Four more dwords accounted for
        mov     ebx,edx                 ; U  ebx = carry into the next pass
        ja      muls32_rem4             ; V  Loop while the count stays above 0

muls32_finish:
        pop     si                      ; U  (V-capable)
        pop     di                      ; V
        mov     ax,dx                   ; U  ax = low word of the carry (V-capable)
        pop     ds                      ; NP
        shr     edx,16                  ; U  dx = high word of the carry
        pop     bp                      ; V
        ret                             ; NP

_lbnMulSub1_32 ENDP
|
|
|
|
|
|
|
|
;; Just for interest's sake, here's a completely Pentium-optimized version.
|
|
;; In addition to being smaller, it takes 8 + (8+mul_time)*n cycles, as
|
|
;; compared to the 10 + jmp_time + (8+mul_time)*n cycles for the loop above.
|
|
;; (I don't know how long a 32x32->64 bit multiply or an indirect jump
|
|
;; take on a Pentium, so plug those numbers in.)
|
|
; align 2
|
|
; nop ; To align loop nicely
|
|
;P_lbnMulAdd1_32 proc far
|
|
;
|
|
; push bp ; U prologue ** Could be V
|
|
; mov bp,sp ; V prologue
|
|
; push ds ; NP prologue
|
|
; mov ecx,[bp+16] ; U load k
|
|
; push si ; V prologue
|
|
; lds si,[bp+10] ; NP load in
|
|
; mov eax,[si] ; U first multiply
|
|
; push di ; V prologue
|
|
; mul ecx ; NP first multiply
|
|
; les di,[bp+6] ; NP load out
|
|
; add es:[di],eax ; U first multiply
|
|
; mov bp,[bp+14] ; V load len
|
|
; adc edx,0 ; U first multiply
|
|
; dec bp ; V
|
|
; mov ebx,edx ; U Remember carry for later
|
|
; je Pma32_done ; V
|
|
;Pma32_loop:
|
|
; mov eax,[si+4] ; U
|
|
; add di,4 ; V
|
|
; mul ecx ; NP
|
|
; add eax,ebx ; U Add carry in from previous word
|
|
; adc edx,0 ; U
|
|
; add si,4 ; V
|
|
; add es:[di],eax ; U
|
|
; adc edx,0 ; U
|
|
; dec bp ; V
|
|
; mov ebx,edx ; U Remember carry for later
|
|
; jne Pma32_loop ; V
|
|
;Pma32_done:
|
|
; pop di ; U ** Could be V
|
|
; pop si ; V
|
|
; pop ds ; NP
|
|
; mov ax,dx ; U return value low ** Could be V
|
|
; pop bp ; V
|
|
; shr edx,16 ; U return value high
|
|
; ret ; NP
|
|
;
|
|
;P_lbnMulAdd1_32 endp
|
|
|
|
|
|
|
|
;; Two-word by one-word divide. Stores quotient, returns remainder.
|
|
;; BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
|
|
;; 4 8 12 16
|
|
;; lbnDiv21_32(q, nh, nl, d): divide the two-dword value nh:nl by d,
;; store the one-dword quotient through the far pointer q and return the
;; remainder in dx:ax.
;; bp is parked in cx rather than pushed, so the arguments sit two bytes
;; lower than in the other routines: q = [bp+4], nh = [bp+8],
;; nl = [bp+12], d = [bp+16].
;; Lines are tagged U, V or NP (not pairable) for Pentium pairing.
        ALIGN   16
_lbnDiv21_32 PROC FAR
        mov     cx,bp                   ; U  Save caller's bp without touching the stack
        mov     bp,sp                   ; V
                                        ;    (address generation interlock here)
        mov     edx,[bp+8]              ; U  edx = nh, high dword of dividend
        mov     eax,[bp+12]             ; U  eax = nl, low dword of dividend
        div     DWORD PTR [bp+16]       ; NP eax = quotient, edx = remainder
        les     bx,[bp+4]               ; NP es:bx = q
        mov     es:[bx],eax             ; U  *q = quotient
        mov     ax,dx                   ; V  ax = low word of remainder
        shr     edx,16                  ; U  dx = high word of remainder
        mov     bp,cx                   ; V  Restore caller's bp
        ret                             ; NP

        ;; Never executed; four bytes of padding that align lbnModQ_32.
        nop
        nop
        nop
        nop

_lbnDiv21_32 ENDP
|
|
|
|
;; Multi-word by one-word remainder.
|
|
;; This speeds up key generation. It's not worth unrolling and so on;
|
|
;; using 32-bit divides is enough of a speedup.
|
|
;;
|
|
;; bp is used as a counter so that all the 32-bit values can be in
|
|
;; caller-save registers (eax, ecx, edx). bx is needed as a pointer.
|
|
;;
|
|
;; The modulus (in ecx) is 16 bits. Given that the dividend is 32 bits,
|
|
;; the chances of saving the first divide because the high word of the
|
|
;; dividend is less than the modulus are low enough it's not worth taking
|
|
;; the cycles to test for it.
|
|
;;
|
|
;; unsigned lbnModQ_32(BNWORD32 *q, unsigned len, unsigned d)
|
|
;; 6 10 12
|
|
;; lbnModQ_32(q, len, d): return in ax the remainder of the len-dword
;; number at q divided by the 16-bit value d.
;; Far cdecl frame: q = [bp+6] (far ptr), len = [bp+10], d = [bp+12].
;; ecx holds d zero-extended to 32 bits, bp counts dwords, ds:bx walks
;; the number from most to least significant dword.  bp and ds are
;; saved and restored.  The loop body always runs at least once, so
;; len is expected to be non-zero.
;; Lines are tagged U, V or NP (not pairable) for Pentium pairing;
;; "V-capable" marks a U-pipe instruction that could have issued in V.
_lbnModQ_32 PROC FAR
        xor     ecx,ecx                 ; U  High half of ecx must be 0
        push    bp                      ; V
        mov     edx,ecx                 ; U  High dword of the first dividend is 0
        mov     bp,sp                   ; V
        push    ds                      ; NP
        lds     ax,[bp+6]               ; NP ds:ax = q
        mov     bx,[bp+10]              ; U  bx = len (V-capable)
        sub     ax,4                    ; V  ax = q - 4
        mov     cx,[bp+12]              ; U  ecx = d (V-capable)
        mov     bp,bx                   ; V  bp = len, the loop counter
        shl     bx,2                    ; U  bx = 4*len
        add     bx,ax                   ; U  bx = q + 4*len - 4, the top dword (V-capable)

mq32_next:
        mov     eax,[bx]                ; U  eax = current dword
        sub     bx,4                    ; V  Step down to the next lower dword
        div     ecx                     ; NP edx = (edx:eax) mod d
        dec     bp                      ; U  (V-capable)
        jnz     mq32_next               ; V
mq32_finish:
        pop     ds                      ; NP
        mov     ax,dx                   ; U  Remainder is below d, so it fits in ax (V-capable)
        pop     bp                      ; V
        ret                             ; NP

_lbnModQ_32 ENDP
|
|
|
|
|
|
;; int not386(void) returns 0 on a 32-bit (386 or better) processor;
|
|
;; non-zero if an 80286 or lower. The Z flag is set to reflect
|
|
;; ax on return. This is only called once, so it doesn't matter how
|
|
;; it's aligned.
|
|
|
|
;; int not386(void): return 0 in ax on a 32-bit ('386 or later) processor
;; and a non-zero value on an 80286 or earlier.  ZF on return agrees
;; with ax.  Uses only ax and the flags.
_not386 PROC FAR
        ;; Stage 1 catches the 8086 and '186.  Those implement "push sp"
        ;; as "--sp; sp[0] = sp" (the decremented value is stored); all
        ;; later processors do "sp[-1] = sp; --sp" (the old value is
        ;; stored).
        push    sp
        pop     ax
        sub     ax,sp                   ; 0 only if the old sp was what got pushed
        jne     SHORT n386_exit         ; 8086/'186: ax != 0 and ZF clear

        ;; Stage 2 is the real test.  It probably also catches the 8086,
        ;; V30 and 80186, but it has not been tried on those, which is
        ;; why the well-known test above guards it.  It has been tested
        ;; on the 80286, 80386, 80486 and Pentium, and by AMD on the K5;
        ;; it is unconfirmed on the P6, although someone at Intel said it
        ;; should work.
        ;;
        ;; The '386 and later shift with a barrel shifter, while the '286
        ;; does a left shift by repeated adds.  On the '286 the auxiliary
        ;; carry therefore ends up as a copy of bit 4 of the shift result;
        ;; on the '386 and later it is trashed independently of the result
        ;; (as it happens, set to 1; it is documented as undefined).
        ;;
        ;; Two shifts that give different auxiliary carries on a '286 are
        ;; done and their flag bytes XORed.  Every flag other than AF is
        ;; defined for shl and comes out the same both times, so the XOR
        ;; is non-zero only where AF followed the data.  A future
        ;; processor that treats AF differently will most likely still do
        ;; so independently of the data, so the test should keep working.
        mov     al,4
        shl     al,1                    ; al = 8:  '286 gives AF = 0
        lahf                            ; ah = flags after the first shift
        shl     al,1                    ; al = 16: '286 gives AF = 1
        mov     al,ah                   ; al = first flag byte
        lahf                            ; ah = flags after the second shift
        xor     al,ah                   ; Non-zero (and ZF clear) if the flags differed
        mov     ah,al                   ; ax == 0 exactly when al == 0; mov leaves ZF alone
n386_exit:
        ret

_not386 ENDP
|
|
|
|
_TEXT   ENDS                            ; Close the code segment

        END                             ; End of source; no program entry point
|