Login [Register]
Don't have an account? Register now to chat, post, use our tools, and much more.
Share the most optimized eZ80 routines!
Can anybody do better? (Usually: Yes.)


16-bit unsigned multiplication returning 32-bit product. It expects to be called in Z80 mode.

Code:

mul16:
;;Expects Z80 mode
;;Inputs: hl,bc
;;Outputs: Upper 16 bits in DE, lower 16 bits in BC
;;54 or 56 t-states
;;30 bytes
    ld d,c \ ld e,l \ mlt de \ push de ;11
    ld d,h \ ld e,b \ mlt de           ;8
    ld a,l \ ld l,c \ ld c,a           ;3
    mlt hl \ mlt bc \ add hl,bc        ;13
    jr nc,$+3 \ inc de \ pop bc        ;6
    ld a,b \ add a,l \ ld b,a          ;3
    ld a,e \ adc a,h \ ld e,a          ;3
    ret nc \ inc d \ ret               ;7 (+2 for carry)

As a note, I found Karatsuba multiplication to be less efficient up to 24-bits and I have yet to write a 32-bit routine. EDIT:It is more efficient at 32-bits, but only by little


Remember, a lot of the optimized Z80 routines work well here, but some of them can actually be reworked to be even more efficient than a direct translation.
EDIT: 5-Feb-15 The following are UNTESTED.

Here is a 16-bit GCD algorithm.

Code:

GCD16:
;;Expect Z80 mode
;;Inputs: HL,DE
;;Output: HL
;;        BC=0
;;        DE=0
;;        A=0
;Binary GCD algorithm
;About 432cc on average (average of 500 iterations with randomized inputs on [0,65535]
;78 bytes
    xor a
    ld b,a
    ld c,a
    sbc hl,bc
    ex de,hl
    ret z
    sbc hl,bc
    ex de,hl
    ret z

;factor out cofactor powers of two
    ld a,l \ or e \ rra \ jr nc,$+16
    srl h \ rr l
    srl d \ rr e
    inc b
    ld a,l \ or e \ rra \ jr nc,$-12
.loop:
;factor out powers of 2 from hl
    ld a,l \ rra \ ld a,h \ jr c,$+10
    rra \ rr l \ bit 0,l \ jr z,$-5 \ ld h,a
;factor out powers of 2 from de
    ld a,e \ rra \ ld a,d \ jr c,$+10
    rra \ rr e \ bit 0,e \ jr z,$-5 \ ld d,a

    xor a
    sbc hl,de
    jr z,finish
    jr nc,loop
    add hl,de
    or a
    ex de,hl
    sbc hl,de
    jr loop
.finish:
    ex de,hl
    dec b
    inc b
    ret z
    add hl,hl \ djnz $-1 \ ret

32-bit multiplication:

Code:

mul32:
;;Expects Z80 mode
;;Inputs:  ix points to the first 32-bit number
;;         ix+4 points to the next 32-bit number
;;Outputs: ix+8 points to the 64-bit result
;;Algorithm: Karatsuba
;348cc to 375cc

;compute z0
    ld hl,(ix)        ;5
    ld bc,(ix+4)      ;5
    call mul16        ;59+2
    ld (ix+8),bc      ;5
    ld (ix+10),de     ;5
;compute z2           
    ld hl,(ix+2)      ;5
    ld bc,(ix+6)      ;5
    call mul16        ;59+2
    ld (ix+12),bc     ;5
    ld (ix+14),de     ;5
;compute z1, most complicated as it requires 17-bits of info for each factor
    ld hl,(ix+2)      ;5
    ld bc,(ix)        ;5
    add hl,bc         ;1
    rla               ;1
    ld b,h            ;1
    ld c,l            ;1
                       
    ld hl,(ix+6)      ;5
    ld de,(ix+4)      ;5
    add hl,de         ;1
    rla               ;1
                       
    push hl           ;3
    push bc           ;3
    push af           ;3
    call mul16        ;59+2
    pop af            ;3
    and 3             ;2
    ex de,hl          ;1
    pop de            ;3
    jr z,$+7 ;label0  ;3|(6+1)
;bit 0 means add [stack0] to the upper 16 bits
;bit 1 means add [stack1] to the upper 16 bits
;both mean add 2^32   
    jp po,$+5 \ or 4                     ;--
    rra \ jr nc,$+7                      ;4+4
    rrca \ add hl,bc \ adc a,0 \ rlca    ;--
                                         ;
    srl a \ pop de \ jr nc,$+5           ;8+2
    add hl,bc \ adc a,0                  ;
    ld d,b \ ld e,c                      ;2

;z1 = AHLDE-z0-z2
    ld bc,(ix+8) \ ex de,hl \ sbc hl,bc              ;8
    ld bc,(ix+10) \ ex de,hl \ sbc hl,bc \ sbc a,0   ;10

    ld bc,(ix+12) \ ex de,hl \ sbc hl,bc             ;8
    ld bc,(ix+14) \ ex de,hl \ sbc hl,bc \ sbc a,0   ;10
    ex de,hl                                         ;1

    ld bc,(ix+10) \ add hl,bc \ ld (ix+10),hl \ ex de,hl  ;12
    ld bc,(ix+12) \ adc hl,bc \ ld (ix+12),hl \ adc a,0   ;13
    ret z \ ld hl,(ix+14) \ add a,l \ ld l,a              ;7+16
    jr nc,$+3 \ inc h \ ld (ix+14),hl \ ret               ;--


sqrt16:

Code:

sqrt16:
;;Expext Z80 mode
;;Inputs: HL
;;Output: A
;Unrolled, speed optimized.
;At most 112 clock cycles
;111 bytes.
    xor a \ ld c,a \ ld d,a \ ld e,l \ ld l,h \ ld h,c

    add hl,hl \ add hl,hl \ sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a

    add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
    sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a

    add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
    sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a

    add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
    sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a

    add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
    sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a

    add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
    sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a

    sla c \ ld a,c \ add a,a \ add hl,hl \ add hl,hl
    jr nc,$+5 \ sub h \ jr $+5 \ sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a

    ld e,h \ ld h,l \ ld l,c \ ld a,l
    add hl,hl \ rl e \ rl d
    add hl,hl \ rl e \ rl d
    sbc hl,de \ rla \ ret

EDIT: 9-Feb-15
This one performs A*HL/255. Be warned that this does not work on certain boundary values. For example, A=85 would imply division by 3, but if you input HL as divisible by 3, you get one less than the actual result. So 9*85/255 should return 3, but this routine returns 2 instead.

Code:

fastMul_Ndiv255:
;;Expects Z80 mode
;;Description: Multiplies HL by A/255
;;Inputs: A,HL
;;Outputs: HL is the result
;;         A is a copy of the upper byte of the result
;;         DE is the product of the input A,H
;;         B is a copy of E
;;         C is a copy of the upper byte of the product of inputs A,L
;;32cc
;;18 bytes
    ld d,a
    ld e,h
    ld h,d
    mlt hl
    mlt de
;15
;DE
; DE
; HL
;  HL
    ld c,h
    ld b,e
    ld a,d
    add hl,bc \ adc a,0
    add hl,de \ adc a,0
    ld l,h \ ld h,a
    ret
;17

One way to almost remedy the roud-off issue, (I think except for HL=65535) is:

Code:

fastMul_Ndiv255_fix:
;;Expects Z80 mode
;;Description: Multiplies HL by A/255
;;Inputs: A,HL
;;Outputs: HL is the result
;;52cc
;;26 bytes
    push af
    ld d,a
    ld e,l
    ld l,d
    mlt hl
    mlt de
;HL
; HL
; DE
;  DE
    ld c,d
    ld b,l
    ld a,h
    add hl,de \ adc a,0
    add hl,bc \ adc a,0
    ld d,l \ ld l,h \ ld h,a
    pop af  \ add a,e
    ret nc
    adc a,d
    ret nc
    inc hl
    ret

This comes at a cost of 21cc and 8 bytes.
The rounding version works fairly well and only adds 5cc and 5 bytes:

Code:

fastMul_Ndiv255_round:
;;Expects Z80 mode
;;Description: Multiplies HL by A/255
;;Inputs: A,HL
;;Outputs: HL is the result
;;37cc
;;23 bytes
    ld d,a
    ld e,l
    ld l,d
    mlt hl
    mlt de
;15
;HL
; HL
; DE
;  DE
    ld c,d
    ld b,l
    ld a,h
    add hl,de \ adc a,0
    adc hl,bc \ adc a,0
    sla l
    ld l,h \ ld h,a
    jr nc,$+3
    inc hl
    ret
;22

edit: fixed and improved ↑

24-bit integer square root that expects ADL mode. This can also be used for a fixed point square root, where the upper 16-bits make the 8.8 fixed point number, the lower 8 bits are zero (or if the previous operation had extended precision, load that). The output would be in D.E

Code:

sqrt24:
;;Expects ADL mode
;;Inputs: HL
;;Outputs: DE is the integer square root
;;         HL is the difference inputHL-DE^2
;;         c flag reset
    xor a \ ld b,l \ push bc \ ld b,a \ ld d,a \ ld c,a \ ld l,a \ ld e,a
;Iteration 1
    add hl,hl \ rl c \ add hl,hl \ rl c
    sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 2
    add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
    sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 3
    add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
    sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 4
    add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
    sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 5
    add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
    sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 6
    add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
    sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a

;Iteration 7
    add hl,hl \ rl c \ add hl,hl \ rl c \ rl b
    ex de,hl \ add hl,hl \ push hl \ sbc hl,bc \ jr nc,$+8
    ld a,h \ cpl \ ld b,a
    ld a,l \ cpl \ ld c,a
    pop hl
    jr nc,$+4 \ inc hl \ inc hl
    ex de,hl
;Iteration 8
    add hl,hl \ ld l,c \ ld h,b \ adc hl,hl \ adc hl,hl
    ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
    jr nc,$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 9
    pop af
    rla \ adc hl,hl \ rla \ adc hl,hl
    ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
    jr nc,$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 10
    rla \ adc hl,hl \ rla \ adc hl,hl
    ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
    jr nc,$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 11
    rla \ adc hl,hl \ rla \ adc hl,hl
    ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
    jr nc,$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 11
    rla \ adc hl,hl \ rla \ adc hl,hl
    ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
    jr nc,$+6 \ sbc hl,de \ inc de \ inc de
    rr d \ rr e \ ret


DE/BC

Code:

div16:
;;Inputs: DE is the numerator, BC is the divisor
;;Outputs: DE is the result
;;         A is a copy of E
;;         HL is the remainder
;;         BC is not changed
;140 bytes
;145cc
    xor a \ sbc hl,hl

    ld a,d
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ cpl \ ld d,a

    ld a,e
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
    rla \ cpl \ ld e,a
    ret
As some of you may know, it's not possible to access the top 8 bits of a 24-bit register directly, which nullifies some of the methods we've used for 16-bit registers. But thankfully, the ALU is no longer slow for 16/24-bit data, so some things can be optimized in different ways than before.

Check if HL is 0:

Code:
    ;sets zero/sign
    ;4 bytes, 4 cycles
    add hl,de
    or a,a
    sbc hl,de


Check if BC is 0:

Code:
    ;sets HL=BC, sets zero/sign, preserves carry
    ;4 bytes, 4 cycles
    sbc hl,hl
    adc hl,bc


Compare HL to DE:

Code:
    ;sets all flags from comparison
    ;4 bytes, 4 cycles
    or a,a
    sbc hl,de
    add hl,de
I know I saw from some of calc84's code that to set HL to zero:

Code:

or a  \ sbc hl,hl

That is 3cc and destroys some flags, but is faster than ld hl,0 in ADL mode (same speed in Z80). Plus, if you know that the c flag is reset, you can do just "sbc hl,hl" for 2 bytes, 2cc instead of 3 or 4 for Z80 and ADL respectively.
Xeda, I could use an ADL-mode PRNG routine. It should generate at least 16-bits at a time. (It'd be great if I could specify a maximum to scale the number, too.) I could just use your Rand24 routine, and just add .S to any instructions that need it, but I suspect the MLT instruction would be very useful for that routine.

Honestly, I'm using ADL mode for robotfindskitten because it makes reading a datafile from the archive a lot easier, and I don't feel like trying to manage short/long mode transitions. Furthermore, you can't use the OS interrupt handler in short mode, which would mean writing driver routines I haven't completed yet.

I'd also like to use the RTC as an entropy source. Here's an ugly routine I threw together quickly to convert the current time into a 32-bit integer.

Code:

;------ GetRtcTimeLinear -------------------------------------------------------
GetRtcTimeLinear:
; WARNING: THIS CODE WILL SOMETIMES (RARELY) READ THE TIME INCORRECTLY!
; It does not freeze the time before reading it.
; Not a problem for entropy-purposes, but it is a problem if you need the actual
; time.
;
; This monstrous routine will return the time as a 32-bit number of seconds in
; DE:HL.
; At least, I think.
; Inputs:
;  - None
; Output:
;  - DE:HL
; Destroys:
;  - AF
;  - BC
;  - IX maybe?
;  - ???
   ; Minutes into seconds
   ld   de, (mpRtcMinutes)
   ld   d, 60
   mlt   de
   ; Hours into seconds
   ld   hl, (mpRtcHours)
   ld   h, 225
   mlt   hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, de
   ; Add seconds
   ld   de, (mpRtcSeconds)
   add   hl, de
   ; Convert into DE:HL format
   push   hl
   ld   b, 8
rtcHighBitLoop:
   add   hl, hl
   adc   a, a
   djnz   rtcHighBitLoop
   pop   hl
   ld   e, a
   ld   d, 0
   push   de
   push   hl
   ; Days into seconds.
   ld   hl, (mpRtcDays)
   ld   bc, (60 * 60 * 24) / 2   ; Half
   call   MultBcByDe
   push   bc
   pop   hl
   add.s   hl, hl
   ex   de, hl
   adc.s   hl, hl         ; ... and double
   ex   de, hl
   ; Add in time-of-day
   pop   bc
   add.s   hl, bc
   ex   de, hl
   pop   bc
   adc.s   hl, bc
   ex   de, hl
   ; Done
   ret


;------ MultBcByDe -------------------------------------------------------------
MultBcByDe:
; Multiplies BC (16-bit) by HL (16-bit).
; From Xeda
; Inputs:
;  - HL (16-bits)
;  - BC (16-bits)
; Output:
;  - DE:BC
; Destroys:
;  - HL
;  - AF
   ld   d, c   \   ld   e, l   \   mlt   de   \   push   de
   ld   d, h   \   ld   e, b   \   mlt   de
   ld   a, l   \   ld   l, c   \   ld   c, a
   mlt   hl   
   mlt   bc
   add.s   hl, bc
   jr   nc, $ + 3 \   inc   de
   pop   bc
   ld   a, b   \   add   a, l   \   ld   b, a
   ld   a, e   \   adc   a, h   \   ld   e, a
   ret   nc
   inc   d
   ret


Another source of entropy I'm thinking of using is the 24-bit checksum of RAM.

Code:
   ld   bc, 3
   ld   iy, ((1024 * 256) + (320 * 240 * 2)) / 256
   ld   hl, 0D00000h
_:   xor   a
_:   ld   de, (hl)
   add   hl, bc
   add   ix, de
   dec   a
   jr   nz, -_
   dec   iy
   ld   a, iyl
   or   iyh
   jr   nz, --_


Optimizations?
Oh goody, I've been working on PRNGs all morning and decided to finally check my email Razz Using the RTC is an excellent idea of course.

Here is a 24-bit LFSR which is almost certainly not going to be good for a game (it *could* be, but only in certain situations):

Code:

LFSR24:
;;ADL mode expected
;;Input: (seed1) is non-zero
;;Output: (seed1) updated, HL is the result
;;Destroys: A
;;30cc (-3cc if using SMC)
#IF smc == 0
    ld hl,(seed1)
#ELSE
seed1 = $+1
    ld hl,1
#ENDIF
    ld a,l
    add hl,hl
    rla
    and %00010111
    jp po,$+5
    inc l
    ld (seed1),hl
    ret

It is fast, and I use it for some noisiness. It has period 2^24-1. Next, you could use an LCG like the following:

Code:

lcg16777213:
;;Expects ADL mode
;;x=11875x+137 mod (2^24-3)
;;Returns HL
#IF smc == 0
    ld hl,(seed24)
#ELSE
seed24 = $+1
    ld hl,10101
#ENDIF
;multiply seed by 95, first
    xor a \ ld d,a
    add hl,hl \ rla
    add hl,hl \ rla
    add hl,hl \ rla
    add hl,hl \ rla
    add hl,hl \ rla
    ld e,a
    push hl
    add hl,hl \ rla
    add hl,hl \ rla
    ld bc,(seed24)
    sbc hl,bc
    sbc a,d
    or a
    pop bc
    sbc hl,bc
    sbc a,e
;AHL * 125
;=AHL*128-AHL*2-AHL
    ld e,a
    push de \ push hl
    add hl,hl \ rla
    ld e,a
    push de \ push hl
    add hl,hl \ rla \ rl d
    add hl,hl \ rla \ rl d
    add hl,hl \ rla \ rl d
    add hl,hl \ rla \ rl d
    add hl,hl \ rla \ rl d
    add hl,hl \ rla \ rl d
    ld e,a
    pop bc \ sbc hl,bc \ ex de,hl \ pop bc \ sbc hl,bc \ ex de,hl
    pop bc \ sbc hl,bc \ ex de,hl \ pop bc \ sbc hl,bc \ ex de,hl
;add 37
    ld c,137
    add hl,bc
    ld c,b
    ex de,hl
    adc hl,de
    ex de,hl
;mod 2^16-3
    ex de,hl
    ld b,h
    ld c,l
    add hl,hl
    add hl,bc
    ex de,hl
    sbc hl,de \ jr nc,$+5 \ dec hl \ dec hl \ dec hl
    ld (seed24),hl
    ret

This has a period of 2^24-3. The problem of course, is that it doesn't provide a full 24-bit range of values. However, this should be fairly suitable for games, especially with the extra noise you are adding.

For those not using RTC noise, I suggest using the LFSR for noise by combining the two routines into one:

Code:

prng24:
;;Expects ADL mode
;;Returns HL
;;uses seed24 and seed1 for an LCG and LFSR respectively
;;period length is (2^24-1)(2^24-3) = 281,474,909,601,795
#IF smc == 0
    ld hl,(seed24)
#ELSE
seed24 = $+1
    ld hl,10101
#ENDIF
;multiply seed by 95, first
    xor a \ ld d,a
    add hl,hl \ rla
    add hl,hl \ rla
    add hl,hl \ rla
    add hl,hl \ rla
    add hl,hl \ rla
    ld e,a
    push hl
    add hl,hl \ rla
    add hl,hl \ rla
    ld bc,(seed24)
    sbc hl,bc
    sbc a,d
    or a
    pop bc
    sbc hl,bc
    sbc a,e
;AHL * 125
;=AHL*128-AHL*2-AHL
    ld e,a
    push de \ push hl
    add hl,hl \ rla
    ld e,a
    push de \ push hl
    add hl,hl \ rla \ rl d
    add hl,hl \ rla \ rl d
    add hl,hl \ rla \ rl d
    add hl,hl \ rla \ rl d
    add hl,hl \ rla \ rl d
    add hl,hl \ rla \ rl d
    ld e,a
    pop bc \ sbc hl,bc \ ex de,hl \ pop bc \ sbc hl,bc \ ex de,hl
    pop bc \ sbc hl,bc \ ex de,hl \ pop bc \ sbc hl,bc \ ex de,hl
;add 37
    ld c,137
    add hl,bc
    ld c,b
    ex de,hl
    adc hl,de
    ex de,hl
;mod 2^16-3
    ex de,hl
    ld b,h
    ld c,l
    add hl,hl
    add hl,bc
    ex de,hl
    sbc hl,de \ jr nc,$+5 \ dec hl \ dec hl \ dec hl
    ld (seed24),hl
    ex de,hl
#IF smc == 0
    ld hl,(seed1)
#ELSE
seed1 = $+1
    ld hl,1
#ENDIF
    ld a,l
    add hl,hl
    rla
    and %00010111
    jp po,$+5
    inc l
    ld (seed1),hl
    add hl,de
    ret


I will try to remember to calculate timings later. I hope this helps (and works, as I did not test it).

EDIT:
I made some routines that will probably be more useful for you, DrDnar, but they all expect to be called in Z80 mode. CALL.IS should suffice.

The RNG I included here has a much smaller cycle length (about 1/65536 the size), but feel free to replace it with any of your own, as long as it has some of its output in HL. Included are routines to put a range on the output (8-bit and 16-bit ranges), as well as an optimized rand10 which generates a random int from 0 to 9. I used that to generate random TI floats

Code:

;eZ80
;smc = 1
;inc_rand        = 1
;inc_randrange8  = 1
;inc_randrange16 = 1
;inc_rand10      = 1


;;random.inc
;;Random Number Generator Routines
;;Author:
;;    Zeda Thomas (xedaelnara@gmail.com)
;;License:
;;    +All parts of this document may be used for any purpose.
;;    +Give some credit to the author, even if only mentioned in the source code.
;;      ++This is mainly so that any issues with the codes or algorithms can be reported.
;;      ++If this license info is unchanged and included in this file, that will suffice.
;;Disclaimer:
;;    +These are NOT cryptographically secure. These are quality PRNGs for games and simulation.
;;Routines:
;;  routine         in         out              mode    size    speed
;;=====================================================================
;;  rand            ---         HL               Z80    53      65~67
;;  randrange8      B           A in [0,B-1]     Z80    10      85~87
;;  randrange16     BC          HL in [0,BC-1]   Z80    10      142~146
;;  rand10          ---         A in [0,9]       Z80    16      88~90
;;  randTI          HL          (hl)             Z80    27      1540~1568
;;  mlt16           HL,BC       DEBC             Z80    30      54~56

#IF inc_rand==1
rand:
;;Input:
;;  (seed1) is a non-zero 16-bit int
;;  (seed2) is a 16-bit int
;;Output:
;;  HL is the pseudo-random number
;;  DE is the output of the LFSR
;;  BC is the previous seed of the Lehmer PRNG
;;  (seed1) is the output of the LFSR
;;  (seed2) is the output of the Lehmer PRNG
;;Destroys: A
;;Notes:
;;  This uses a 16-bit Lehmer PRNG, and an LFSR
;;  The period is 4,294,901,760 (65536*65535)
;;  Technically,the Lehmer PRNG here can have values from 1 to 65536. In this application, we store 65536 as 0.
;;Speed:
;;    Add 4cc if smc==0
;;    65+2a, a is 0 or 1
;;      probability a= 1 is 38/65536
;;    Average: 65.00115966796875
;;    Best:65
;;    Worst:67
;;53 bytes
#IF smc == 0
    ld hl,(seed2)       ;5
#ELSE
seed2 = $+1
    ld hl,0             ;3
#ENDIF
;multiply by 75
    ld a,h
    or l
    jr nz,nospecial
    ld hl,-74
    jr writeseed2
nospecial:
    ld b,h
    ld c,75
    ld h,c
    mlt hl
    mlt bc
    ld a,c
    add a,h
    ld h,a
    ld c,b
    ld b,0
    sbc hl,bc
    jr nc,$+5
    inc hl \ inc hl \ inc hl
writeseed2:
    ld (seed2),hl
    ex de,hl
#IF smc == 0
    ld hl,(seed1)
#ELSE
seed1 = $+1
    ld hl,1
#ENDIF
    ld a,h
    add hl,hl
    and %10110100
    jp po,$+4
    inc l
    ld (seed1),hl
    add hl,de
    ret
#ENDIF
#IF inc_randrange8==1
randrange8:
;;Input: B
;;Output: A,H is a number on [0,B-1]
;;Example, if B is 10, this outputs an int from 0 to 9
;;85cc~87
;;10 bytes
    push bc     ;3
    call rand   ;5+
    pop bc      ;3
    ld l,b
    mlt hl
    ld a,h
    ret
#ENDIF
#IF inc_randrange16==1
randrange16:
;;Input: BC is the exclusive upperbound
;;Output:HL is an int less than BC
;;142cc~146cc
    push bc
    call rand
    pop bc
    call mul16
    ex de,hl
    ret
mul16:
;;Expects Z80 mode
;;Inputs: hl,bc
;;Outputs: Upper 16 bits in DE, lower 16 bits in BC
;;54 or 56 t-states
;;30 bytes
    ld d,c \ ld e,l \ mlt de \ push de ;11
    ld d,h \ ld e,b \ mlt de           ;8
    ld a,l \ ld l,c \ ld c,a           ;3
    mlt hl \ mlt bc \ add hl,bc        ;13
    jr nc,$+3 \ inc de \ pop bc        ;6
    ld a,b \ add a,l \ ld b,a          ;3
    ld a,e \ adc a,h \ ld e,a          ;3
    ret nc \ inc d \ ret               ;7 (+2 for carry)
#ENDIF
#IF inc_randTI==1
randTI:
;;Inputs: HL points to where the float should be written (does not allocate RAM).
;;Outputs: Stores a random floating point number on [0,1) to where HL points
;;1540~1568
inc_rand10 = 1
    ld (hl),0 \ inc hl
    ld (hl),7Fh \ inc hl
    ld b,7
randTIloop:
    push bc
    push hl
    call rand10
    pop hl
    rrd
    push hl
    call rand10
    pop hl
    rrd
    pop bc
    djnz randTIloop
    ret
#ENDIF
#IF inc_rand10==1
rand10:
;;Output: A is an int from 0 to 9.
;;88cc~90cc
;;16 bytes
    call rand   
    xor a
    ld b,h
    ld c,l
    add hl,hl \ rla
    add hl,hl \ rla
    add hl,bc \ adc a,0
    add hl,hl \ rla
    ret
#ENDIF

Absolute Value:

Inputs: HL
Outputs: abs(HL)


Code:
AbsHL:
 ld de,0
 or a,a \ adc hl,de
 ret p
 ex de,hl
 sbc hl,de
 ret
MateoConLechuga wrote:
Absolute Value:

Inputs: HL
Outputs: abs(HL)


Code:
AbsHL:
 ld de,0
 or a,a \ adc hl,de
 ret p
 ex de,hl
 sbc hl,de
 ret

I believe this is an improvement:

Code:
AbsHL:
 ex de,hl
AbsDE:
 or a,a
 sbc hl,hl
 sbc hl,de
 ret p
 ex de,hl
 ret

At least, it's an improvement across uniform inputs. It's actually slightly slower (4 cycles) for nonnegative inputs, but it's much faster (21 cycles) for negative inputs.
Runer112 wrote:

It's actually slightly slower (4 cycles) for nonnegative inputs...

A slight modification can fix that at no cost to the negative case or size, but destroying A:

Code:

absHL:
 ex de,hl
absDE:
 ld a,h
 or a
 ret p
 sbc hl,hl
 sbc hl,de
 ret
Xeda112358 wrote:
A slight modification can fix that at no cost to the negative case or size, but destroying A:

Code:

absHL:
 ex de,hl
absDE:
 ld a,h
 or a
 ret p
 sbc hl,hl
 sbc hl,de
 ret

Hm, that's neat! Forgot about the 1's complement inversion and conversion to 2's complement. Technically that measures the sign of the middle byte though, yet it obviously still works. Smile Nice job.
I made a simple text routine for fonts up to 8 pixels wide. It assumes a font in the following format:

Code:
Font_Width           = 8 ; Width in pixels (any integer from 1-8)
Font_Height          = 14 ; Height in pixels (any integer from 1-240)
Font_HSpacing        = 0 ; Space between characters in pixels
Font_VSpacing        = 0 ; Space between lines in pixels
Font_CharData:
   .db $00,$00,$FE,$C6,$C6,$C6,$C6,$C6,$C6,$C6,$C6,$FE,$00,$00 ; Char 00
   .db $00,$00,$FE,$C6,$C6,$C6,$C6,$C6,$C6,$C6,$C6,$FE,$00,$00 ; Char 01
   etc.


The text itself should be in this format:

Code:
txt_HelloWorld:
   .db "Hello world!",1
   .db "The byte $01 denotes a newline",1
   .db "The byte $00 terminates the string",0


Here is the code itself: http://pastebin.com/XKEYt9vR

Hope this is useful to someone! Wink
EDIT 1-31-2016: It turns out that the multiplication and division routines I had posted here didn't work for many cases, specifically negative numbers. I have fixed both. (Also now neither of them destroy IX and IY!)

I'm currently working on some 16.8 fixed-point math routines. Addition and subtraction are obviously very easy, and here are routines I wrote for multiplication and division. I'm absolutely sure they could be optimized.
Pastebin!

Neither of these has been extensively tested, so there could be minor issues.

Also, here are some neat tricks!

HLu→A

Code:
   dec sp
   push hl
   inc sp
   pop af


Byte shift HL up

Code:
   push hl
   dec sp
   pop hl
   inc sp


"Pop grab" (pop something off the stack, but leave it on the stack)

Code:
   pop hl
   dec sp \ dec sp \ dec sp


They are pretty simple and obvious, but I thought I'd put them here anyway.
Here's a pretty nice line routine that doesn't care what order the x or y coordinates are. Works in 8bpp mode. It also doesn't use ix or iy, so it is compatible with ZDS C and other things. No clipping is implemented; that will be with an alternate routine that clips the line before it draws it.
It is fairly well optimized, but then again I don't know much about optimization. Smile Enjoy!

Inputs:

Code:
de=x, hl=x, b=y, c=y, a=indexed color


Code:

Code:
_line:
 ld (color),a
 ld (color2),a
 ld a,c
 ld (y1),a
 push de
  push hl
   push bc   
    or a,a
    sbc hl,de
    ld a,$03
    jr nc,+_
    ld a,$0B
_:  ld (xStep),a
    ld (xStep2),a
    ex de,hl
    or a,a
    sbc hl,hl
    sbc hl,de
    jp p,+_
    ex de,hl
_:  ld (dx),hl
    push hl
     add hl,hl
     ld (dx1),hl
     ld (dx12),hl
     or a,a
     sbc hl,hl
     ex de,hl
     sbc hl,hl
     ld e,b
     ld l,c
     or a,a
     sbc hl,de
     ld a,$3C
     jr nc,+_
     inc a
_:   ld (yStep),a
     ld (yStep2),a
     ex de,hl
     or a,a
     sbc hl,hl
     sbc hl,de
     jp p,+_
     ex de,hl
_:   ld (dy),hl
     add hl,hl
     ld (dy1),hl
     ld (dy12),hl
    pop de
   pop af
   ld hl,(dy)
   or a,a
   sbc hl,de
  pop de
 pop bc
 ld hl,0
 jr nc,changeYLoop
changeXLoop:
 push hl
  ld l,a
  ld h,lcdWidth/2
  mlt hl
  add hl,hl
  add hl,bc
  push bc
   ld bc,vram
   add hl,bc
color: =$+1
   ld (hl),0
   pop bc
   push bc
   pop hl
   or a,a
   sbc hl,de
 pop hl
 ret z
xStep:   
 nop
 push de
dy1: =$+1
  ld de,0
  or a,a
  adc hl,de
  jp m,+_
dx: =$+1
  ld de,0
  or a,a
  sbc hl,de
  add hl,de
  jp c,+_
yStep:
  nop
dx1: =$+1
  ld de,0
  sbc hl,de
_:
 pop de
 jp changeXLoop
changeYLoop:
 push bc
  push hl
   ld l,a
   ld h,lcdWidth/2
   mlt hl
   add hl,hl
   add hl,bc
   ld bc,vram
   add hl,bc
color2: =$+1
   ld (hl),0
  pop hl
 pop bc
y1: =$+1
 cp a,0
 ret z
yStep2:
 nop
 push de
dx12: =$+1
  ld de,0
  or a,a
  adc hl,de
  jp m,+_
dy: =$+1
  ld de,0
  or a,a
  sbc hl,de
  add hl,de
  jp c,+_
xStep2:
  nop
dy12: =$+1
  ld de,0
  sbc hl,de
_:
 pop de
 jp changeYLoop
While working on my 3D renderer I wrote some simple LUT-based trigonometry routines. It takes input in register A where 256 = 360. The result is in 16.8 fixed-point, though you could just change the lookup table to return any other format. Here you go:

EDIT: I think it's pretty well optimized, but I wouldn't say it's perfect. I don't really like needing to use B to store the top two bits of A...


Code:
; Destroys A B DE
; Result in HL
CosA:
   add a,64
SinA:
   ld b,a
   and a,%00111111
   ld hl,3 \ ld h,a \ mlt hl
   bit 6,b
   jr z,_
   ex de,hl
   ld hl,63*3
   sbc hl,de
_:   ld de,SinCosLUT
   add hl,de
   bit 7,b
   jr z,_
   ld de,(hl)
   or a,a \ sbc hl,hl ; carry could have been set by ADD HL,DE four lines earlier
   sbc hl,de
   ret
_:   ld hl,(hl) \ ret

SinCosLUT: ; 192 bytes (64 items, 3 bytes each)
; This could probably be made to be 2 bytes per item, but it would be harder to clear out the MSByte of the output
   .dl 0
   .dl 6
   .dl 13
   .dl 19
   .dl 25
   .dl 31
   .dl 38
   .dl 44
   .dl 50
   .dl 56
   .dl 62
   .dl 68
   .dl 74
   .dl 80
   .dl 86
   .dl 92
   .dl 98
   .dl 104
   .dl 109
   .dl 115
   .dl 121
   .dl 126
   .dl 132
   .dl 137
   .dl 142
   .dl 147
   .dl 152
   .dl 157
   .dl 162
   .dl 167
   .dl 172
   .dl 177
   .dl 181
   .dl 185
   .dl 190
   .dl 194
   .dl 198
   .dl 202
   .dl 206
   .dl 209
   .dl 213
   .dl 216
   .dl 220
   .dl 223
   .dl 226
   .dl 229
   .dl 231
   .dl 234
   .dl 237
   .dl 239
   .dl 241
   .dl 243
   .dl 245
   .dl 247
   .dl 248
   .dl 250
   .dl 251
   .dl 252
   .dl 253
   .dl 254
   .dl 255
   .dl 255
   .dl 256
   .dl 256
Hactar wrote:
"Pop grab" (pop something off the stack, but leave it on the stack)

Code:
   pop hl
   dec sp \ dec sp \ dec sp
This is unsafe if interrupts are enabled! If an interrupt triggers, the interrupt handling data will overwrite the thing on the stack you're trying to preserve.
DrDnar wrote:
Hactar wrote:
"Pop grab" (pop something off the stack, but leave it on the stack)

Code:
   pop hl
   dec sp \ dec sp \ dec sp
This is unsafe if interrupts are enabled! If an interrupt triggers, the interrupt handling data will overwrite the thing on the stack you're trying to preserve.

Just use pop/push. It doesn't have this interrupt problem, it's smaller, and (for non-index registers) it's faster.

By the way, you don't need to make up a name for this operation. It already has a name: "peek."
MateoConLechuga wrote:
Here's a pretty nice line routine that doesn't care what order the x or y coordinates are. Works in 8bpp mode. It also doesn't use ix or iy, so it is compatible with ZDS C and other things. No clipping is implemented; that will be with an alternate routine that clips the line before it draws it.
It is fairly well optimized, but then again I don't know much about optimization. Smile Enjoy!

Mateo, I optimised a few things :

Code:
_line:  ld (color),a    ; de,b =x,y     hl,c=x,y     a=indexed color
        ld (color2),a
        ld a,c
        ld (y1),a
        ld (nde+1),hl
        push de
        push bc     
        or a,a 
        sbc hl,de 
        ld a,$03 
        jr nc,+_ 
        ld a,$0B
_:      ld (xStep),a 
        ld (xStep2),a 
        ex de,hl 
        or a,a 
        sbc hl,hl 
        sbc hl,de 
        jp p,+_ 
        ex de,hl 
_:      ld (dx),hl 
        push hl 
        add hl,hl 
        ld (dx1),hl 
        ld (dx12),hl
        or a,a
        sbc hl,hl
        ex de,hl
        sbc hl,hl
        ld e,b
        ld l,c
        or a,a 
        sbc hl,de
        ld a,30
        adc a,a
        ld (yStep),a
        ld (yStep2),a
        ex de,hl 
        or a,a 
        sbc hl,hl 
        sbc hl,de 
        jp p,+_
        ex de,hl
_:      ld (dy),hl
        add hl,hl
        ld (dy1),hl
        ld (dy12),hl 
        pop de
        pop af
        srl h
        rr l
        sbc hl,de 
        pop bc
        ld hl,0 
        jr nc,changeYLoop
changeXLoop:
        push hl 
        ld l,a 
        ld h,160 
        mlt hl 
        add hl,hl 
        add hl,bc 
        ld de,$d40000
        add hl,de
color: =$+1 
        ld (hl),0 
        sbc hl,hl
        ld h,b
        ld l,c
        or a,a 
nde:    ld de,000
        sbc hl,de 
        pop hl 
        ret z 
xStep:  nop
dy1: =$+1 
        ld de,0 
        or a,a 
        adc hl,de
        jp m,changeXLoop 
dx: =$+1 
        ld de,0 
        or a,a 
        sbc hl,de 
        add hl,de 
        jr c,changeXLoop 
yStep:  nop
dx1: =$+1 
        ld de,0 
        sbc hl,de 
        jr changeXLoop 
changeYLoop:
        push hl
        ld l,a 
        ld h,160
        mlt hl 
        add hl,hl 
        add hl,bc 
        ld de,$d40000
        add hl,de
color2: =$+1 
        ld (hl),0 
        pop hl
y1: =$+1
        cp a,0
        ret z
yStep2:  nop
dx12: =$+1
        ld de,0
        or a,a
        adc hl,de
        jp m,changeYLoop
dy: =$+1
        ld de,0
        or a,a
        sbc hl,de
        add hl,de
        jr c,changeYLoop
xStep2: nop
dy12: =$+1
        ld de,0
        sbc hl,de
        jr changeYLoop
Does anyone have a good routine to get the positive difference between a and c?
tmwilliamlin168 wrote:
Does anyone have a good routine to get the positive difference between a and c?


I've just started learning assembly, but I think "sub c \ ret nc \ neg \ ret" is optimal for unsigned numbers. For the difference between two signed numbers, I think you'd need to negate (a-c) if ((a was <0) xor (c was <0) xor (a-c is <0)).

There's a difference between unsigned and signed, because, for example, when a is 0xFF and c is 0x00, the unsigned difference is 255, but the signed difference is 1. "add 128 \ ld e,a \ ld a,c \ add 128 \ sub e \ ret c \ neg \ ret" works, but there's definitely something better.
lirtosiast wrote:
tmwilliamlin168 wrote:
Does anyone have a good routine to get the positive difference between a and c?


I've just started learning assembly, but I think "sub c \ ret nc \ neg \ ret" is optimal for unsigned numbers. For the difference between two signed numbers, I think you'd need to negate (a-c) if ((a was <0) xor (c was <0) xor (a-c is <0)).


So if c is bigger than a, and you do sub c, then the carry flag will be set?
Yeah, think about it:
c = 10
a = 5
5-10 will set the carry because we go past 0. The other way around (10-5) won't reset the carry because we didn't go past zero so there is no carry.

lirtosiast: I believe your routine should work for unsigned numbers as is as well.
  
Register to Join the Conversation
Have your own thoughts to add to this or any other topic? Want to ask a question, offer a suggestion, share your own programs and projects, upload a file to the file archives, get help with calculator and computer programming, or simply chat with like-minded coders and tech and calculator enthusiasts via the site-wide AJAX SAX widget? Registration for a free Cemetech account only takes a minute.

» Go to Registration page
Page 1 of 2
» All times are GMT - 5 Hours
 
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum

 

Advertisement