I am currently writing another version that uses %d4%d7 for the accumulated result. It'll be tricky since %d0%d3 are the operands themselves, and I need another register or two for temporary calculations. I'm planning on first calculating all partial products that need %d3 so that can be freed first. I still might have to unload a register into an address register so I have enough data registers available for the second half of partial products. Also, I won't be able to do as many staggered long additions, but the speed gains by using registers vs the stack should make up for that.
Anyway, here's the code, which includes the C interface mul64:
Code: .if 0
HAHAHA! Sweet optimization note: any two longs (as a result of multiplying two words) can be added without carry if they are staggered. The largest product of two 16bit unsigned values is 0xFFFE0001, so staggering the adds like so will never carry:
FFFE00010000
0000FFFE0001

FFFEFFFF0001
These can be chained like so without carry.
.endif
 this routine currently uses the following partial product layout:
exploiting the nocarry trick noted above:
first partial product:
0055ddff +
.44ccee.
second partial product:
.1166bb. +
..88aa..
third partial product:
..2277.. +
...99...
fourth partial product:
...33...
 multiply unsigned 64bit integers
 return unsigned 128bit integer
 input:
 x = %d0:%d1
 y = %d2:%d3
 output:
 x*y = %d0:%d1:%d2:%d3
 2326 cycles (74 cycles per mulu)
mulu64:
 save registers
movem.l %d4/%a0%a1,(%sp)
// first partial product
// 0055ddff
push:
ff << 0 dh
dd << 2 df
55 << 4 bf
00 << 6 ae
 ab:cd d0:d1
 ef:gh d2:d3
 dh
move.w %d1,%d4
mulu.w %d3,%d4  dh
move.l %d4,(%sp)
 df
move.w %d1,%d4
mulu.w %d2,%d4  df
move.l %d4,(%sp)
 bf
move.w %d0,%d4
mulu.w %d2,%d4  bf
move.l %d4,(%sp)
 ae
swap %d2
 ab:cd d0:d1
 fe:gh d2:d3
move.l %d0,%d4
swap %d4
mulu.w %d2,%d4  ae
move.l %d4,(%sp)

// .44ccee.
add without carry:
ee << 1 dg
cc << 3 de
44 << 5 be
 dg
move.w %d1,%d4
swap %d3
 ab:cd d0:d1
 fe:hg d2:d3
mulu.w %d3,%d4  dg
add.l %d4,5*2(%sp)
 de
move.w %d1,%d4
mulu.w %d2,%d4  de
add.l %d4,3*2(%sp)
 be
move.w %d0,%d4
mulu.w %d2,%d4  be
add.l %d4,1*2(%sp)

// second partial product
// .1166bb.
push:
 0x0 << 0 // don't really need this
bb << 1 ch
66 << 3 bg
11 << 5 af
 0x0 << 7
 ch
swap %d1
 ab:dc d0:d1
 fe:hg d2:d3
move.l %d3,%d4
swap %d4
mulu.w %d1,%d4  ch
move.l %d4,(%sp)
 bg
move.w %d0,%d4
mulu.w %d3,%d4  bg
move.l %d4,(%sp)
 af
swap %d0
 ba:dc d0:d1
 fe:hg d2:d3
move.l %d2,%d4
swap %d4
mulu.w %d0,%d4  af
move.l %d4,(%sp)
clr.w (%sp)

// ..88aa..
add without carry:
aa << 2 cg
88 << 4 ce
 cg
move %d1,%d4
mulu.w %d3,%d4  cg
add.l %d4,4*2(%sp)
 ce
move %d1,%d4
mulu.w %d2,%d4  ce
add.l %d4,2*2(%sp)

add with carry (upper 7 words or 3.5 longs)
lea 7*2(%sp),%a1  second partial product
lea 7*2+7*2(%sp),%a0  result
addx.l (%a1),(%a0)
addx.l (%a1),(%a0)
addx.l (%a1),(%a0)
addx.w (%a1),(%a0)
 pop off our second partial product
lea 7*2(%sp),%sp

// third partial product
// ..2277..
push:
0x00 << 0 // don't really need this
77 << 2 bh
22 << 4 ag
0x00 << 6
 bh
move.l %d0,%d4
swap %d4
swap %d3
 ba:dc d0:d1
 fe:gh d2:d3
mulu.w %d3,%d4  bh
move.l %d4,(%sp)
 ag
swap %d3
 ba:dc d0:d1
 fe:hg d2:d3
move.w %d0,%d4
mulu.w %d3,%d4  ag
move.l %d4,(%sp)
clr.l (%sp)

// ...99...
add without carry:
99 << 3 cf
 cf
move.w %d1,%d4
swap %d2
 ba:dc d0:d1
 ef:hg d2:d3
mulu.w %d2,%d4  cf
add.l %d4,3*2(%sp)

add with carry (upper 6 words or 3 longs)
lea 6*2(%sp),%a1  third partial product
lea 6*2+6*2(%sp),%a0  result
addx.l (%a1),(%a0)
addx.l (%a1),(%a0)
addx.l (%a1),(%a0)
 pop off the third partial product
lea 6*2(%sp),%sp

// fourth partial product
// ...33...
push:
0x00 << 0 // don't really need this
0x0 << 2 // or this
33 << 3 ah
0x0 << 5
0x00 << 6
 ah
move %d0,%d4
swap %d3
 ba:dc d0:d1
 ef:gh d2:d3
mulu.w %d3,%d4  ah
move.l %d4,(%sp)
clr.w (%sp)
clr.l (%sp)

add with carry (upper 5 words or 2.5 longs)
lea 5*2(%sp),%a1  fourth partial product
lea 5*2+5*2(%sp),%a0  result
addx.l (%a1),(%a0)
addx.l (%a1),(%a0)
addx.w (%a1),(%a0)
 pop off fourth partial product
lea 5*2(%sp),%sp
 pop off result and saved registers
movem.l (%sp)+,%d0%d4/%a0%a1
rts
.global mul64
 void mul64(int64 *a, int64 *b, int128 *result);
mul64:
 save
move.l %d3,(%sp)
move.l 4+4(%sp),%a0
move.l 4+4+4(%sp),%a1
movem.l (%a0)+,%d0%d1
movem.l (%a1)+,%d2%d3
jbsr mulu64
move.l 4+4+4+4(%sp),%a0
lea 4*4(%a0),%a0
movem.l %d0%d3,(%a0)
 restore
move.l (%sp)+,%d3
rts