461 lines
14 KiB
C
461 lines
14 KiB
C
/*
|
|
* Copyright (c) 1995 Colin Plumb. All rights reserved.
|
|
* For licensing and other legal details, see the file legal.c.
|
|
*
|
|
* lbn68000.c - 16-bit bignum primitives for the 68000 (or 68010) processors.
|
|
*
|
|
* This was written for Metrowerks C, and while it should be reasonably
|
|
* portable, NOTE that Metrowerks lets a callee trash a0, a1, d0, d1, and d2.
|
|
* Some 680x0 compilers make d2 callee-save, so instructions to save it
|
|
* will have to be added.
|
|
*
|
|
* This code supports 16 or 32-bit ints, based on UINT_MAX.
|
|
* Regardless of UINT_MAX, only bignums up to 64K words (1 million bits)
|
|
* are supported. (68k hackers will recognize this as a consequence of
|
|
* using dbra.)
|
|
*
|
|
* These primitives use little-endian word order.
|
|
* (The order of bytes within words is irrelevant to this issue.)
|
|
*/
|
|
|
|
#include <limits.h>
|
|
|
|
#include "lbn.h" /* Should include lbn68000.h */
|
|
|
|
/*
|
|
* The Metrowerks C compiler (1.2.2) produces bad 68k code for the
|
|
* following input, which happens to be the inner loop of lbnSub1,
|
|
* so a few less than critical routines have been recoded in assembly
|
|
* to avoid the bug. (Optimizer on or off does not matter.)
|
|
*
|
|
* unsigned
|
|
* decrement(unsigned *num, unsigned len)
|
|
* {
|
|
* do {
|
|
* if ((*num++)-- != 0)
|
|
* return 0;
|
|
* } while (--len);
|
|
* return 1;
|
|
* }
|
|
*/
|
|
asm BNWORD16
|
|
lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow)
|
|
{
|
|
movea.l 4(sp),a0 /* num */
|
|
#if UINT_MAX == 0xffff
|
|
move.w 10(sp),d0 /* borrow */
|
|
#else
|
|
move.w 12(sp),d0 /* borrow */
|
|
#endif
|
|
sub.w d0,(a0)+
|
|
bcc done
|
|
#if UINT_MAX == 0xffff
|
|
move.w 8(sp),d0 /* len */
|
|
#else
|
|
move.w 10(sp),d0 /* len */
|
|
#endif
|
|
subq.w #2,d0
|
|
bcs done
|
|
loop:
|
|
subq.w #1,(a0)+
|
|
dbcc d0,loop
|
|
done:
|
|
moveq.l #0,d0
|
|
addx.w d0,d0
|
|
rts
|
|
}
|
|
|
|
asm BNWORD16
|
|
lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry)
|
|
{
|
|
movea.l 4(sp),a0 /* num */
|
|
#if UINT_MAX == 0xffff
|
|
move.w 10(sp),d0 /* carry */
|
|
#else
|
|
move.w 12(sp),d0 /* carry */
|
|
#endif
|
|
add.w d0,(a0)+
|
|
bcc done
|
|
#if UINT_MAX == 0xffff
|
|
move.w 8(sp),d0 /* len */
|
|
#else
|
|
move.w 10(sp),d0 /* len */
|
|
#endif
|
|
subq.w #2,d0
|
|
bcs done
|
|
loop:
|
|
addq.w #1,(a0)+
|
|
dbcc d0,loop
|
|
done:
|
|
moveq.l #0,d0
|
|
addx.w d0,d0
|
|
rts
|
|
}
|
|
|
|
asm void
|
|
lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
|
|
{
|
|
move.w d3,-(sp) /* 2 bytes of stack frame */
|
|
move.l 2+4(sp),a1 /* out */
|
|
move.l 2+8(sp),a0 /* in */
|
|
#if UINT_MAX == 0xffff
|
|
move.w 2+12(sp),d3 /* len */
|
|
move.w 2+14(sp),d2 /* k */
|
|
#else
|
|
move.w 2+14(sp),d3 /* len (low 16 bits) */
|
|
move.w 2+16(sp),d2 /* k */
|
|
#endif
|
|
|
|
move.w (a0)+,d1 /* First multiply */
|
|
mulu.w d2,d1
|
|
move.w d1,(a1)+
|
|
clr.w d1
|
|
swap d1
|
|
|
|
subq.w #1,d3 /* Setup for loop unrolling */
|
|
lsr.w #1,d3
|
|
bcs.s m16_even
|
|
beq.s m16_short
|
|
|
|
subq.w #1,d3 /* Set up software pipeline properly */
|
|
move.l d1,d0
|
|
|
|
m16_loop:
|
|
move.w (a0)+,d1
|
|
mulu.w d2,d1
|
|
add.l d0,d1
|
|
move.w d1,(a1)+
|
|
clr.w d1
|
|
swap d1
|
|
m16_even:
|
|
|
|
move.w (a0)+,d0
|
|
mulu.w d2,d0
|
|
add.l d1,d0
|
|
move.w d0,(a1)+
|
|
clr.w d0
|
|
swap d0
|
|
|
|
dbra d3,m16_loop
|
|
|
|
move.w d0,(a1)
|
|
move.w (sp)+,d3
|
|
rts
|
|
m16_short:
|
|
move.w d1,(a1)
|
|
move.w (sp)+,d3
|
|
rts
|
|
}
|
|
|
|
|
|
asm BNWORD16
|
|
lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
|
|
{
|
|
move.w d4,-(sp)
|
|
clr.w d4
|
|
move.w d3,-(sp) /* 4 bytes of stack frame */
|
|
move.l 4+4(sp),a1 /* out */
|
|
move.l 4+8(sp),a0 /* in */
|
|
#if UINT_MAX == 0xffff
|
|
move.w 4+12(sp),d3 /* len */
|
|
move.w 4+14(sp),d2 /* k */
|
|
#else
|
|
move.w 4+14(sp),d3 /* len (low 16 bits) */
|
|
move.w 4+16(sp),d2 /* k */
|
|
#endif
|
|
|
|
move.w (a0)+,d1 /* First multiply */
|
|
mulu.w d2,d1
|
|
add.w d1,(a1)+
|
|
clr.w d1
|
|
swap d1
|
|
addx.w d4,d1
|
|
|
|
subq.w #1,d3 /* Setup for loop unrolling */
|
|
lsr.w #1,d3
|
|
bcs.s ma16_even
|
|
beq.s ma16_short
|
|
|
|
subq.w #1,d3 /* Set up software pipeline properly */
|
|
move.l d1,d0
|
|
|
|
ma16_loop:
|
|
move.w (a0)+,d1
|
|
mulu.w d2,d1
|
|
add.l d0,d1
|
|
add.w d1,(a1)+
|
|
clr.w d1
|
|
swap d1
|
|
addx.w d4,d1
|
|
ma16_even:
|
|
|
|
move.w (a0)+,d0
|
|
mulu.w d2,d0
|
|
add.l d1,d0
|
|
add.w d0,(a1)+
|
|
clr.w d0
|
|
swap d0
|
|
addx.w d4,d0
|
|
|
|
dbra d3,ma16_loop
|
|
|
|
move.w (sp)+,d3
|
|
move.w (sp)+,d4
|
|
rts
|
|
ma16_short:
|
|
move.w (sp)+,d3
|
|
move.l d1,d0
|
|
move.w (sp)+,d4
|
|
rts
|
|
}
|
|
|
|
|
|
|
|
asm BNWORD16
|
|
lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
|
|
{
|
|
move.w d4,-(sp)
|
|
clr.w d4
|
|
move.w d3,-(sp) /* 4 bytes of stack frame */
|
|
move.l 4+4(sp),a1 /* out */
|
|
move.l 4+8(sp),a0 /* in */
|
|
#if UINT_MAX == 0xffff
|
|
move.w 4+12(sp),d3 /* len */
|
|
move.w 4+14(sp),d2 /* k */
|
|
#else
|
|
move.w 4+14(sp),d3 /* len (low 16 bits) */
|
|
move.w 4+16(sp),d2 /* k */
|
|
#endif
|
|
|
|
move.w (a0)+,d1 /* First multiply */
|
|
mulu.w d2,d1
|
|
sub.w d1,(a1)+
|
|
clr.w d1
|
|
swap d1
|
|
addx.w d4,d1
|
|
|
|
subq.w #1,d3 /* Setup for loop unrolling */
|
|
lsr.w #1,d3
|
|
bcs.s ms16_even
|
|
beq.s ms16_short
|
|
|
|
subq.w #1,d3 /* Set up software pipeline properly */
|
|
move.l d1,d0
|
|
|
|
ms16_loop:
|
|
move.w (a0)+,d1
|
|
mulu.w d2,d1
|
|
add.l d0,d1
|
|
sub.w d1,(a1)+
|
|
clr.w d1
|
|
swap d1
|
|
addx.w d4,d1
|
|
ms16_even:
|
|
|
|
move.w (a0)+,d0
|
|
mulu.w d2,d0
|
|
add.l d1,d0
|
|
sub.w d0,(a1)+
|
|
clr.w d0
|
|
swap d0
|
|
addx.w d4,d0
|
|
|
|
dbra d3,ms16_loop
|
|
|
|
move.w (sp)+,d3
|
|
move.w (sp)+,d4
|
|
rts
|
|
ms16_short:
|
|
move.w (sp)+,d3
|
|
move.l d1,d0
|
|
move.w (sp)+,d4
|
|
rts
|
|
}
|
|
|
|
/* The generic long/short divide doesn't know that nh < d */
|
|
asm BNWORD16
|
|
lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
|
|
{
|
|
move.l 8(sp),d0 /* nh *and* nl */
|
|
divu.w 12(sp),d0
|
|
move.l 4(sp),a0
|
|
move.w d0,(a0)
|
|
clr.w d0
|
|
swap d0
|
|
rts
|
|
}
|
|
|
|
asm unsigned
|
|
lbnModQ_16(BNWORD16 const *n, unsigned len, BNWORD16 d)
|
|
{
|
|
move.l 4(sp),a0 /* n */
|
|
moveq.l #0,d1
|
|
#if UINT_MAX == 0xffff
|
|
move.w 8(sp),d1 /* len */
|
|
move.w 10(sp),d2 /* d */
|
|
#else
|
|
move.w 10(sp),d1 /* len (low 16 bits) */
|
|
move.w 12(sp),d2 /* d */
|
|
#endif
|
|
|
|
add.l d1,a0
|
|
add.l d1,a0 /* n += len */
|
|
moveq.l #0,d0
|
|
subq.w #1,d1
|
|
|
|
mq16_loop:
|
|
move.w -(a0),d0 /* Assemble remainder and new word */
|
|
divu.w d2,d0 /* Put remainder in high half of d0 */
|
|
dbra d1,mq16_loop
|
|
|
|
mq16_done:
|
|
clr.w d0
|
|
swap d0
|
|
rts
|
|
}
|
|
|
|
/*
|
|
* Detect if this is a 32-bit processor (68020+ *or* CPU32).
|
|
* Both the 68020+ and CPU32 processors (which have 32x32->64-bit
|
|
* multiply, what the 32-bit math library wants) support scaled indexed
|
|
* addressing. The 68000 and 68010 ignore the scale selection
|
|
* bits, treating it as *1 all the time. So a 32-bit processor
|
|
* will evaluate -2(a0,a0.w*2) as 1+1*2-2 = 1.
|
|
* A 16-bit processor will compute 1+1-2 = 0.
|
|
*
|
|
* Thus, the return value will indicate whether the chip this is
|
|
* running on supports 32x32->64-bit multiply (mulu.l).
|
|
*/
|
|
asm int
|
|
is68020(void)
|
|
{
|
|
machine 68020
|
|
lea 1,a0
|
|
#if 0
|
|
lea -2(a0,a0.w*2),a0 /* Metrowerks won't assemble this, arrgh */
|
|
#else
|
|
dc.w 0x41f0, 0x82fe
|
|
#endif
|
|
move.l a0,d0
|
|
rts
|
|
}
|
|
/*
|
|
* Since I had to hand-assemble that fancy addressing mode, I had to study
|
|
* up on 680x0 addressing modes.
|
|
* A summary of 680x0 addressing modes.
|
|
* A 68000 effective address specifies an operand on an instruction, which
|
|
* may be a register or in memory. It is made up of a 3-bit mode and a
|
|
* 3-bit register specifier. The meanings of the various modes are:
|
|
*
|
|
* 000 reg - Dn, n specified by "reg"
|
|
* 001 reg - An, n specified by "reg"
|
|
* 010 reg - (An)
|
|
* 011 reg - (An)+
|
|
* 100 reg - -(An)
|
|
* 101 reg - d16(An), one 16-bit displacement word follows, sign-extended
|
|
* 110 reg - Fancy addressing mode off of An, see extension word below
|
|
* 111 000 - abs.W, one 16-bit signed absolute address follows
|
|
* 111 001 - abs.L, one 32-bit absolute address follows
|
|
* 111 010 - d16(PC), one 16-bit displacemnt word follows, sign-extended
|
|
* 111 011 - Fancy addressing mode off of PC, see extension word below
|
|
* 111 100 - #immediate, followed by 16 or 32 bits of immediate value
|
|
* 111 101 - unused, reserved
|
|
* 111 110 - unused, reserved
|
|
* 111 111 - unused, reserved
|
|
*
|
|
* Memory references are to data space, except that PC-relative references
|
|
* are to program space, and are read-only.
|
|
*
|
|
* Fancy addressing modes are followed by a 16-bit extension word, and come
|
|
* in "brief" and "full" forms.
|
|
* The "brief" form looks like this. Bit 8 is 0 to indicate this form:
|
|
*
|
|
* 1 1 1 1 1 1 1
|
|
* 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
|
|
* +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
* |A/D| register |L/W| scale | 0 | 8-bit signed displacement |
|
|
* +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
*
|
|
* The basic effective address specifies a 32-bit base register - A0 through
|
|
* A7 or PC (the address of the following instruction).
|
|
* The A/D and register fields specify an index register. A/D is 1 for
|
|
* address registers, and 0 for data registers. L/W specifies the length
|
|
* of the index register, 1 for 32 bits, and 0 for 16 bits (sign-extended).
|
|
* The scale field is a left shift amount (0 to 3 bits) to apply to the
|
|
* sign-extended index register. The final address is d8(An,Rn.X*SCALE),
|
|
* also written (d8,An,Rn.X*SCALE). X is "W" or "L", SCALE is 1, 2, 4 or 8.
|
|
* "*1" may be omitted, as may a d8 of 0.
|
|
*
|
|
* The 68000 supports this form, but only with a scale field of 0.
|
|
* It does NOT (says the MC68030 User's Manual MC68030UM/AD, section 2.7)
|
|
* decode the scale field and the following format bit. They are treated
|
|
* as 0.
|
|
* I recall (I don't have the data book handy) that the CPU32 processor
|
|
* core used in the 683xx series processors supports variable scales,
|
|
* but only the brief extension word form. I suspect it decodes the
|
|
* format bit and traps if it is not zero, but I don't recall.
|
|
*
|
|
* The "full" form (680x0, x >= 2 processors only) looks like this:
|
|
*
|
|
* 1 1 1 1 1 1 1
|
|
* 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
|
|
* +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
* |A/D| register |L/W| scale | 1 | BS| IS|BD size| 0 | P |OD size|
|
|
* +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
*
|
|
* The first 8 bits are interpreted the same way as in the brief form,
|
|
* except that bit 8 is set to 1 to indicate the full form.
|
|
* BS, Base Suppress, if set, causes a value of 0 to be used in place of
|
|
* the base register value. If this is set, the base register
|
|
* specified is irrelevant, except that if it is the PC, the fetch is
|
|
* still done from program space. The specifier "ZPC" can be used in
|
|
* place of "PC" in the effective address mnemonic to represent this
|
|
* case.
|
|
* IS, Index Suppress, if set, causes a value of 0 to be used in place
|
|
* of the scaled index register. In this case, the first 7 bits of the
|
|
* extension word are irrelevant.
|
|
* BD size specifies the base displacement size. A value of 00
|
|
* in this field is illegal, while 01, 10 and 11 indicate that the
|
|
* extension word is followed by 0, 1 or 2 16-bit words of base displacement
|
|
* (zero, sign-extended to 32 bits, and most-significant word first,
|
|
* respectively) to add to the base register value.
|
|
* Bit 3 is unused.
|
|
* The P bit is the pre/post indexing bit, and only applies if an outer
|
|
* displacement is used. This is explained later.
|
|
* OD size specifies the size of an outer displacement. In the simple
|
|
* case, this field is set to 00 and the effective address is
|
|
* (disp,An,Rn.X*SCALE) or (disp,PC,Rn.X*SCALE).
|
|
* In this case the P bit must be 0. Any of those compnents may be
|
|
* suppressed, with a BD size of 01, the BS bit, or the IS bit.
|
|
* If the OD size is not 00, it encodes an outer displacement in the same
|
|
* manner as the BD size, and 0, 1 or 2 16-bit words of outer displacement
|
|
* follow the base displacement in the instruction stream. In this case,
|
|
* this is a double-indirect addressing mode. The base, base displacement,
|
|
* and possibly the index, specify a 32-bit memory word which holds a value
|
|
* which is fetched, and the outer displacement and possibly the index are
|
|
* added to produce the address of the operand.
|
|
* If the P bit is 0, this is pre-indexed, and the index value is added
|
|
* before the fetch of the indirect word, producing an effective address
|
|
* of ([disp,An,Rn.X*SCALE],disp). If the P bit is 1, the post-indexed case,
|
|
* the memory word is fectched from base+base displacement, then the index
|
|
* and outer displacement are added to compute the address of the operand.
|
|
* This effective address is written ([disp,An],Rn.X*SCALE,disp).
|
|
* (In both cases, "An" may also be "PC" or "ZPC".)
|
|
* Any of the components may be omitted. If the index is omitted (using the
|
|
* IS bit), the P bit is irrelevant, but must be written as 0.
|
|
* Thus, legal combinations of IS, P and OD size are:
|
|
* 0 0 00 - (disp,An,Rn.X*SCALE), also written disp(An,Rn.X*SCALE)
|
|
* 0 0 01 - ([disp,An,Rn.X*SCALE])
|
|
* 0 0 10 - ([disp,An,Rn.X*SCALE],d16)
|
|
* 0 0 11 - ([disp,An,Rn.X*SCALE],d32)
|
|
* 0 1 01 - ([disp,An],Rn.X*SCALE)
|
|
* 0 1 10 - ([disp,An],Rn.X*SCALE,d16)
|
|
* 0 1 11 - ([disp,An],Rn.X*SCALE,d32)
|
|
* 1 0 00 - (disp,An), also written disp(An)
|
|
* 1 0 01 - ([disp,An])
|
|
* 1 0 10 - ([disp,An],d16)
|
|
* 1 0 11 - ([disp,An],d32)
|
|
*/
|
|
|
|
/* 45678901234567890123456789012345678901234567890123456789012345678901234567 */
|