This routine is a fast way to copy a chunk of data. For BC>=35, calling it proves to be faster than using LDIR, and I believe it has identical inputs and outputs:

Code:

fastLDIR:
;copy BC bytes from HL to DE
;Inputs:  HL = source, DE = destination, BC = byte count
;Outputs: HL, DE, BC and the flags are left as the LDI instructions leave them.
;         A is saved and restored around the setup code.
;Method:  the unrolled LDI run is entered at byte offset 2*((-C) & (n-1)), so
;         the first pass executes only the "remainder" LDIs (or all n of them
;         when C is a multiple of n); every later pass executes all n.
;Cost:
;    27cc for having to call
;    110cc for setting up the loop, worst case
;    10cc * ceiling(BC/n)        ;n=2^k for some k, see the line below "ldirloop:"
;    16cc * BC
;costs roughly 152-BC*(5-10/n) more than a simple LDIR (worst case)
;for n=4,  BC>=61 saves
;for n=8,  BC>=41 saves
;for n=16, BC>=35 saves   * default, see the "ldirloop" to change
;for n=32, BC>=33 saves
;for n=64, BC>=32 saves
    push hl              ;keep the source pointer; HL is needed for the address math
    push af              ;keep A across the setup
    xor a
    sub c                ;A = -C
    and 15               ;change to n-1
    add a,a              ;each LDI is 2 bytes long, so scale the count to a byte offset
    ld hl,ldirloop
    add a,l
    ld l,a
    jr nc,$+3  ;these aren't needed if the ldirloop doesn't cross a 256 byte boundary. Can save 12cc on the above timings and 3 bytes.
    inc h       ;
    pop af
    ex (sp),hl           ;HL = original source pointer, top of stack = entry address
    ret                  ;"return" into the unrolled run at the computed entry point
ldirloop:
;n=16, (number of LDI instructions, use qty of 4,8,16,32,64)
    ldi
    ldi
    ldi
    ldi
   
    ldi
    ldi
    ldi
    ldi
   
    ldi
    ldi
    ldi
    ldi
   
    ldi
    ldi
    ldi
_ldirloop_end:
    ldi
    jp pe,ldirloop       ;LDI leaves P/V set while BC is still non-zero
    ret

This might be useful for things like copying code to RAM (from an App) in speed critical applications, or other data handling tasks.

EDIT 29 Sept 18: Saved 1 byte and 3cc. Originally, I had 'ld a,16 \ sub c \ and 15'. The 'ld a,16' could hold any multiple of n (16 in this example), including 0, so I just used 'xor a'. I also updated all the timing info.
While working on a 2D graphics engine and finding all registers but E tied up, I came up with this beautiful method for multiplying HL by 12

Code:

; Multiplies HL by 12 using only E as scratch; D is read but not modified.
; Relies on HL <= 63 on entry (so HL*4 fits in L) and on D being small enough
; that HL*12+D*256 does not overflow 16 bits.
ADD HL, HL ; HL*2
ADD HL, HL ; HL*4
LD E, L    ; As HL is a vertical coordinate and thus cannot exceed 63, therefore HL*4 will be the size of a single byte
           ; No data is lost storing it in E also note that D still contains an arbitrary number
ADD HL, HL ; HL*8
ADD HL, DE ; HL*12+D*256
LD E, 0    ; DE is now exactly D*256, the unwanted part of the sum
SBC HL, DE ; Carry is 0 as HL*12+D*256 will not overflow as D will be smaller than 96, (width of screen)
           ; thus the result is HL*12 and only destroying E

Also only 9 bytes large, 3 bytes larger than the usual way, which destroys DE

I thought I'd just share this for all the efficiency freaks out there :)

Code:
; hl <= 85
; Multiplies HL by 12. Destroys E; D is read but not modified.
; The bound keeps hl*3 within one byte, so clearing H below loses nothing.
   ld e,l    ; E = hl (H is 0 for hl <= 85), D is whatever the caller left there
   add hl,hl ; hl*2
   add hl,de ; hl*3+d*256
   ld h,0    ; hl*3
   add hl,hl ; hl*6
   add hl,hl ; hl*12


7 bytes and 55 cycles. Wink 1 byte larger and 3 cycles slower than the conventional method.

EDIT: And a version that's 8 bytes but 46 cycles:

Code:
; hl <= 85
; Multiplies HL by 12. Destroys E; A is saved in E and restored.
; H is never written before the 16-bit doublings, so it must already be 0
; (which hl <= 85 implies).
   ld e,a    ; save A
   ld a,l
   add a,a   ; hl*2
   add a,l   ; hl*3
   ld l,a
   ld a,e    ; restore A
   add hl,hl ; hl*6
   add hl,hl ; hl*12
This is a fast 16-bit Lehmer RNG. It is seeded, and has a period of 65536.

Code:

smc = 1   ;use 1 if the code is in RAM, since it is faster. If it is in an app, use 0 and define a 2-byte RAM location for the seed.

lehmer:
;;Input:
;;  (seed) has the seed value of the RNG
;;Output:
;;  (seed) is updated, HL is the result
;;Destroys:
;;  A,DE,BC
;;Timing:
;;  if seed>0     236cc or 237cc, condition dependent (2cc more for seed=$8000)
;;  if seed=0     98cc
;;  if smc=1      subtract 6cc
;;Size: 46 bytes
;;Notes:
;;    Uses the Lehmer RNG used by the Sinclair ZX81
;;    75x mod 65537 -> x
;;    A stored seed of 0 stands for x = 65536.
#IF smc == 0
    ld hl,(seed)
#ELSE
seed = $+1
    ld hl,0
#ENDIF
;multiply by 75 = %1001011, building the 24-bit product in A:HL
    ld c,l
    ld b,h
    xor a
    adc hl,hl           ;2x. ADC (unlike ADD) sets Z when the 16-bit result is 0
    jr nz,lehmer_mul
    jr nc,special       ;Z without carry: the seed really was 0
;Z with carry means the seed was $8000 (2x = $10000). That is an ordinary
;value and must go through the normal multiply, not the seed=0 special case.
lehmer_mul:
    ld d,a \ rla        ;D = 0 for the ADC A,D below; pull the carry into A
    add hl,hl \ rla
    add hl,hl \ rla \ add hl,bc \ adc a,d
    add hl,hl \ rla
    add hl,hl \ rla \ add hl,bc \ adc a,d
    add hl,hl \ rla \ add hl,bc
;modulo 65537, see note below on how this works
    ld e,a
    sbc hl,de       ;the carry left by the last ADD HL,BC has not been added to A yet;
                    ;SBC subtracts it together with DE, so HL = HL - (A + carry)
    jr nc,$+3
    inc hl          ;borrow: add 65537, i.e. +1 modulo 65536
    ld (seed),hl
    ret
special:
;In the case that HL=0, this should be interpreted as 65536 = -1 mod 65537, so return -75 mod 65537 = -74 mod 65536 in HL
    ld hl,-74
    ld (seed),hl
    ret
   
;mod by 2^16 + 1 (a prime)
;current form is A*2^16+HL
;need:
;  (A*2^16+HL) mod (2^16+1)
;add 0 as +1-1
;  (A*(2^16+1-1)+HL) mod (2^16+1)
;distribute
;  (A*(2^16+1)-A+HL) mod (2^16+1)
;A*(2^16+1) mod 2^16+1 = 0, so remove
;  (-A+HL) mod (2^16+1)
;Oh hey, that's easy! :P
;I use this trick everywhere, you should, too.
ConvOP1

This is a replacement for the _ConvOP1 B_CALL. It exists to make these improvements over the B_CALL:
  • Support the full unsigned 16-bit integer domain, instead of capping out at 9999.
  • Throw a domain error if the integer is outside of the domain, instead of an invalid dim error.
  • Throw a domain error when given a non-integer value, instead of just converting the truncated value.
  • Throw a type error when given:
    • A non-number, instead of producing 0.
    • A complex number, instead of just converting the real component.
    • A negative number, instead of just converting the absolute value.
Unlike the B_CALL, the output is in HL.

There are two versions of the routine: one sets OP1 to floating-point 0 and is slower, but is smaller; and one that doesn't modify OP1 and is faster, but is larger. For both routines, if you're targeting the monochrome 83+ or 84+, you may define 83p to save 2 bytes and some cycles.

If you plan on using this code, you may want to grab the raw paste from the pastebin link, as that preserves the tabs.



59 bytes, ~ (355 + 210*digits) cycles, modifies OP1 (pastebin)

Code:
;Attempts to convert the TI float in OP1 into an unsigned integer in HL.
;
;Throws:
; - A data type error if OP1 doesn't hold a nonnegative real number.
; - A domain error if the value cannot be exactly converted to an unsigned
;   16-bit integer.
;
; I: (OP1)=float
; O: A=0, BC=((uint)(OP1))%10, DE=OP1+8, HL=(uint)(OP1), (OP1)=(float)0
;FO: S=0, Z=1, H=0, P/V=0, N=1, C=0
;CC: 355 + 210*d
;    d = (OP1)!=0 ? floor(log10((OP1))) + 1 : 1
ConvOP1:
;Throws an error if OP1 doesn't hold a nonnegative real number.
;(Any nonzero first byte of OP1 is rejected.)
   ld   a,(OP1)
   or   a
   jr   nz,ErrDataType
;Initializes the 16-bit accumulator to 0.
   ld   h,a
   ld   l,a
ConvOP1_Loop:
;Multiplies the 16-bit accumulator by 10, checking for overflow.
;A is 0 on every entry to the loop (from the type check on the first pass,
;from the XOR A below afterwards), so it ends up nonzero only if one of the
;additions carried out of HL.
   ld   b,h
   ld   c,l
   add   hl,hl
   adc   a,a
   add   hl,hl
   adc   a,a
   add   hl,bc
   adc   a,a
   add   hl,hl
   adc   a,a
   jr   nz,ErrDomain
;Rotates the first three mantissa bytes of OP1 left by a nibble, collecting the
;highest nibble/digit rotated out.
;Only L is decremented, so this relies on OP1+1..OP1+4 not crossing a
;256-byte boundary.
   ex   de,hl
   ld   hl,OP1+4
   rld
   dec   l
   rld
   dec   l
   rld
   dec   l
;Adds the highest nibble/digit rotated out to the 16-bit accumulator, checking
;for overflow. Decrements the exponent and continues looping if it doesn't
;become $7F. Doesn't care about bad exponents, as the 16-bit accumulator would
;overflow eventually.
   ld   c,a
   xor   a
   ld   b,a
   dec   (hl)         ;P/V is set only on the $80 -> $7F step
   ex   de,hl
   add   hl,bc         ;16-bit ADD leaves P/V from the DEC (HL) untouched
   jr   c,ErrDomain
   jp   po,ConvOP1_Loop
ConvOP1_CheckIntLoop:
;Returns successfully if the last byte of the mantissa has been checked.
;A is 0 whenever this label is reached.
#ifdef 83p
;NOTE(review): with A=0 this returns once bit 7 of E is set; presumably the low
;byte of OP1+8 is the first one in OP1's range with bit 7 set on these models
;- confirm against the include file.
   or   e
   ret   m
#else
   ld   a,(OP1+8)&$FF
   sub   e
   ret   z
#endif
;Continues if the next byte of the mantissa is zero.
   inc   e
   ld   a,(de)
   or   a
   jr   z,ConvOP1_CheckIntLoop
;A nonzero byte here means digits remain after the integer part; falls through.
ErrDomain:
;Throws a domain error.
   B_CALL(_ErrDomain)
ErrDataType:
;Throws a data type error.
   B_CALL(_ErrDataType)




84 bytes, ~ (380 + 120*digits) cycles, preserves OP1 (pastebin)

Code:
;Attempts to convert the TI float in OP1 into an unsigned integer in HL.
;
;Throws:
; - A data type error if OP1 doesn't hold a nonnegative real number.
; - A domain error if the value cannot be exactly converted to an unsigned
;   16-bit integer.
;
;Don't try to hijack this routine with a pointer other than OP1. It won't work.
;
;Each pass of the main loop consumes one mantissa byte: first its high digit,
;then (if the exponent has not run out) its low digit.
;The saved exponent is still on the stack when an overflow check inside the
;loop jumps to ErrDomain.
;
; I: (OP1)=float
; O: A=0, BC=?, DE=OP1+8, HL=(uint)(OP1)
;FO: S=0, Z=1, H=0, P/V=0, N=1, C=0
;CC: 69 + 154*((d+1)/2) + 131*(d/2) + (d%2 ? 21 : 12) + 43*(7-((d+1)/2))
;    d = (OP1)!=0 ? floor(log10((OP1))) + 1 : 1
ConvOP1:
;Throws an error if OP1 doesn't hold a nonnegative real number.
   ld   de,OP1
   ld   a,(de)
   or   a
   jr   nz,ErrDataType
;Initializes the 16-bit accumulator to 0 and reads the exponent.
   ld   h,a
   ld   l,a
   inc   e
   ld   a,(de)
ConvOP1_Loop:
;Saves the exponent.
   push   af
;Reads the high nibble/digit of the next mantissa byte.
   inc   e
   ld   a,(de)
;Multiplies the 16-bit accumulator by 10, checking for overflow where necessary,
;and shifts the high digit right by 4 bits into normal position.
;Each RRA takes in the carry of the 16-bit ADD just before it; those carries
;are all 0 on the non-error path, so A ends up holding just the high digit.
   ld   b,h
   ld   c,l
   add   hl,hl
   rra
   add   hl,hl
   rra
   add   hl,bc      ;9999*4=39996+9999=49995, cannot overflow yet
   rra
   add   hl,hl      ;9999*5=49995*2=99990, can now overflow
   jr   c,ErrDomain
   rra
;Adds the now shifted high digit to the 16-bit accumulator, checking for
;overflow.
   ld   b,0
   ld   c,a
   add   hl,bc
   jr   c,ErrDomain   ;65530+9=65539, can still overflow
;Restores the exponent, decrements it, and breaks out if it becomes $7F. Doesn't
;care about bad exponents, as the 16-bit accumulator would overflow eventually.
   pop   af
   dec   a
   jp   pe,ConvOP1_DoneMidByte
;Multiplies the 16-bit accumulator by 10, checking for overflow where necessary.
   add   hl,hl      ;40000*2=80000, can still overflow
   jr   c,ErrDomain
   ld   b,h
   ld   c,l
   add   hl,hl      ;20000*2=40000*2=80000, can still overflow
   jr   c,ErrDomain
   add   hl,hl      ;10000*4=40000*2=80000, can still overflow
   jr   c,ErrDomain
   add   hl,bc      ;any 5-digit number has already overflowed
;Reads the low nibble/digit of the current mantissa byte and adds it to the
;16-bit accumulator.
   ld   b,a         ;park the exponent in B while A is used for the digit
   ld   a,(de)
   and   $0F
   ld   c,a
   ld   a,b         ;exponent back in A
   ld   b,0
   add   hl,bc
;Decrements the exponent and continues looping if it doesn't become $7F.
   dec   a
   jp   po,ConvOP1_Loop
;Skips conversion-finished-mid-byte code.
   jr   ConvOP1_CheckIntLoop
ConvOP1_DoneMidByte:
;Throws an error if the low nibble/digit of this mantissa byte is nonzero.
   ld   a,(de)
   and   $0F
   jr   nz,ErrDomain
ConvOP1_CheckIntLoop:
;Returns successfully if the last byte of the mantissa has been checked.
#ifdef 83p
;NOTE(review): this returns once bit 7 of E is set; presumably the low byte of
;OP1+8 is the first one in OP1's range with bit 7 set on these models -
;confirm against the include file.
   or   e
   ret   m
#else
   ld   a,(OP1+8)&$FF
   sub   e
   ret   z
#endif
;Continues if the next byte of the mantissa is zero.
   inc   e
   ld   a,(de)
   or   a
   jr   z,ConvOP1_CheckIntLoop
;A nonzero byte here means digits remain after the integer part; falls through.
ErrDomain:
;Throws a domain error.
   B_CALL(_ErrDomain)
ErrDataType:
;Throws a data type error.
   B_CALL(_ErrDataType)
AverageBGR565

Calculates the average of two BGR565 (or RGB565) colors. Pretty aggressively optimized. Probably works.

If you plan on using this code, you may want to grab the raw paste from pastebin, as that preserves the tabs.

Code:
; Calculates the average of the BGR565 (or RGB565) colors in DE and HL into HL.
;    I: DE=color1, HL=color2
;    O: A=color1&$FF&~((1<<5)+1), BC=color1&color2&(((1<<6)+1<<5)+1),
;       DE=color1&~(((1<<6)+1<<5)+1), HL=average(color1,color2)
;   FO: SZP(L), C=0
;   CC: 125
; Method: the lowest bit of each field (bits 0, 5 and 11) is cleared before the
; 16-bit add so that, after the shift right, no field bleeds into the one below
; it. BC collects the bits that are set in BOTH colors at those positions and
; is added back at the end, since each such pair contributes 1 to the average.
AverageBGR565:
   ld   a,d
   and   h
   and   1<<(6+5-8)      ; bit 11 of the word, as a bit of the high byte
   ld   b,a
   ld   a,e
   and   l
   and   (1<<5)+1      ; bits 5 and 0
   ld   c,a
   res   (6+5)&7,d      ; clear bit 11 of color1
   ld   a,e
   and   ~((1<<5)+1)      ; clear bits 5 and 0 of color1
   ld   e,a
   res   (6+5)&7,h      ; clear bit 11 of color2
   res   5,l         ; clear bit 5 of color2; its bit 0 is simply shifted out below
   add   hl,de
   rr   h         ; 17-bit sum (carry:HL) shifted right by one
   rr   l
   add   hl,bc
   ret
modified x-post:
So earlier on IRC I posted a challenge that I was thinking about before bed: Convert an 8-bit unsigned integer to BCD as fast as you can.

I managed to get my code down to 143.5cc, then jacobly noticed an optimization on mine, and I noticed a bug that had been carried through from the beginning. After all was done, we got it down to 131cc.

The code given is a subroutine (so adding an ret to the end, +1 byte +10cc). Without further ado:

L_To_Dec

Code:

L_To_Dec:
;;Unrolled
;;Converts the 8-bit register L to binary coded decimal
;;Digits stored in LA (A has the lower 2 digits, L the upper).
;;Inputs: L is the 8-bit unsigned int to convert
;;Output: A has the lower 2 digits (in BCD form), L has the upper
;;Destroys: H,F
;;141cc
;;27 bytes
;;Method: shift-and-adjust. The top 4 bits are moved into H and converted with
;;one DAA; each remaining bit is then shifted out of L and doubled into A with
;;ADC A,A \ DAA. The same RL L that shifts the next bit out also shifts the
;;DAA carry (overflow past 99) into the bottom of L, which is how L ends up
;;holding the hundreds digit.
    ld h,0
    add hl,hl
    add hl,hl
    add hl,hl
    add hl,hl               ;H = high nibble of the input, L = low nibble << 4
    ld a,h \ daa  \ rl l    ;A = BCD of the high nibble (0..15)
    adc a,a \ daa \ rl l
    adc a,a \ daa \ rl l
    adc a,a \ daa \ rl l
    adc a,a \ daa \ rl l    ;last RL L only shifts the final DAA carry into L
    ret



Related to the previous routine, I offer several alternatives to the bcalls _SetXXOP1 and _SetXXOP2. These routines are for converting 8-bit integers to TI floats. The advantages with my routines are that they are faster, you don't need to truncate to only the bottom 2 digits, and you can store the output to any location instead of just OP1 or OP2.

So first, if you still want the output to be the same:
setXXOP2
setXXOP1
setXX

Code:

setXXOP2:
    ld hl,OP2
    jr setXX
setXXOP1:
    ld hl,OP1
setXX:
;;Inputs: A is the unsigned int
;;        HL is where to write the TI float
;;Destroys:All
;;291cc+38b (or 144cc if A=0)
;;average: b=29/255
;;295.3215686cc
;;59 bytes
;;Writes 9 bytes at HL: a 0 type byte, the exponent, then 7 mantissa bytes.
;;Only the two low decimal digits of A are stored (the hundreds digit is
;;never collected).
;;NOTE(review): for A=0 this returns with exponent 81h and an all-zero
;;mantissa - confirm that consumers accept this as zero.
    ld c,0
    ld (hl),c \ inc hl      ;type byte
    ld (hl),81h             ;exponent for a two-digit value
    inc hl \ ld (hl),c
    ld d,h \ ld e,l         ;DE -> first mantissa byte
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c

    or a \ ret z    ;If A is zero, exit early. +138cc
    ld l,a          ;\
    ld h,c          ; |
    add hl,hl       ; |Start converting A to BCD
    add hl,hl       ; |
    add hl,hl       ; |
    add hl,hl       ; |
    ld a,h \ daa  \ rl l    ; |Finish converting A to BCD
    adc a,a \ daa \ rl l    ; |Number is in LA
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa           ;/ +124cc
    ex de,hl        ;HL -> first mantissa byte
    ld (hl),a
    and $F0
    ret nz      ;+29cc
;The tens digit is 0: move the ones digit to the high nibble and lower the
;exponent to match.
    rld         ;\ Rotate up 1 digit
    dec hl      ; |
    ld (hl),80h ; |
    ret         ; /

And if you want to get all 3 digits:
setXXXOP1
setXXX

Code:

setXXXOP1:
    ld hl,OP1
setXXX:
;;Inputs: A is the unsigned int
;;        HL is where to write the TI float
;;Destroys:All
;;423cc+13a+63b (or 233cc if A=0)
;;average: a=99/255, b=29/255
;;435.2117647cc average
;;64 bytes
;;Writes 9 bytes at HL: a 0 type byte, the exponent, then 7 mantissa bytes.
;;The digits are first stored with a leading 0 digit and the exponent set one
;;too high; the final block shifts them up a digit and decrements the exponent.
;;NOTE(review): for A=0 this returns with exponent 83h and an all-zero
;;mantissa - confirm that consumers accept this as zero.
    ld bc,$0700     ;B = 7 mantissa bytes to clear, C = 0
    ld (hl),c       ;type byte
    inc hl
    ld (hl),83h     ;provisional exponent
    ld d,h
    ld e,l          ;DE -> exponent byte
    inc hl \ ld (hl),c \ djnz $-2
    or a \ ret z    ;If A is zero, exit early. +227cc
    ld l,a          ;\
    ld h,c          ; |
    add hl,hl       ; |Start converting A to BCD
    add hl,hl       ; |
    add hl,hl       ; |
    add hl,hl       ; |
    ld a,h \ daa  \ rl l    ; |Finish converting A to BCD
    adc a,a \ daa \ rl l    ; |Number is in LA
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa \ rl l    ;/ +132cc
    ex de,hl        ;HL -> exponent byte, E = hundreds digit; Z from RL L survives
;No hundreds digit (Z): treat the two low digits as the leading byte and drop
;the exponent to 81h. Otherwise E already holds the leading byte (0h).
    jr nz,$+6 \ ld e,a \ xor a \ ld (hl),81h    ;+(21+4/85)cc
    inc hl
    ld (hl),e
    inc hl
    ld (hl),a
    ld a,e
    and $F0
    ret nz      ;+48cc
    rld         ;\ Rotate up 1 digit
    dec hl      ; |
    rld         ; |
    dec hl      ; |
    dec (hl)    ; |Decrement exponent
    ret         ; /+63(29/255)cc

And if you want to do that, but maybe a little faster:
setXXX (+11 bytes, -89cc)

Code:

setXXX:
;;Inputs: A is the unsigned int
;;        HL is where to write the TI float
;;Destroys:All
;;334cc+13a+63b (or 144cc if A=0)
;;average: a=99/255, b=29/255
;;346.2117647cc average
;;75 bytes
;;Same routine as the looped setXXX, with the 7-byte mantissa clear unrolled.
;;NOTE(review): for A=0 this returns with exponent 83h and an all-zero
;;mantissa - confirm that consumers accept this as zero.
    ld c,0
    ld (hl),c       ;type byte
    inc hl
    ld (hl),83h     ;provisional exponent
    ld d,h
    ld e,l          ;DE -> exponent byte
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
   
    or a \ ret z    ;If A is zero, exit early. +138cc
    ld l,a          ;\
    ld h,c          ; |
    add hl,hl       ; |Start converting A to BCD
    add hl,hl       ; |
    add hl,hl       ; |
    add hl,hl       ; |
    ld a,h \ daa  \ rl l    ; |Finish converting A to BCD
    adc a,a \ daa \ rl l    ; |Number is in LA
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa \ rl l    ;/ +132cc
    ex de,hl        ;HL -> exponent byte, E = hundreds digit; Z from RL L survives
;No hundreds digit (Z): treat the two low digits as the leading byte and drop
;the exponent to 81h. Otherwise E already holds the leading byte (0h).
    jr nz,$+6 \ ld e,a \ xor a \ ld (hl),81h    ;+(21+4/85)cc
    inc hl
    ld (hl),e
    inc hl
    ld (hl),a
    ld a,e
    and $F0
    ret nz      ;+48cc
    rld         ;\ Rotate up 1 digit
    dec hl      ; |
    rld         ; |
    dec hl      ; |
    dec (hl)    ; |Decrement exponent
    ret         ; /+63(29/255)cc

And if you want to convert signed 8-bit ints:
setXXX_signed

Code:

setXXX_signed:
;;Inputs: A is the signed int (-128..127)
;;        HL is where to write the TI float
;;Destroys:All
;;Writes 9 bytes at HL: the type/sign byte (00h, or 80h for a negative
;;input), the exponent, then 7 mantissa bytes.
;;NOTE(review): for A=0 this returns with exponent 81h and an all-zero
;;mantissa - confirm that consumers accept this as zero.
    ld bc,$0700         ;B = 7 mantissa bytes to clear, C = 0
    ld (hl),c           ;assume non-negative
    or a
    jp p,setXXX_signed_abs
    neg                 ;A = magnitude; -128 becomes 80h, i.e. 128 unsigned
    ld (hl),80h         ;mark the float as negative
setXXX_signed_abs:
    inc hl
    ld (hl),81h         ;provisional exponent: two digits
    ld d,h
    ld e,l              ;DE -> exponent byte
    inc hl \ ld (hl),c \ djnz $-2
    or a \ ret z
    ld l,a          ;\
    ld h,c          ; |
    add hl,hl       ; |Start converting A to BCD
    add hl,hl       ; |
    add hl,hl       ; |
    add hl,hl       ; |
    ld a,h \ daa  \ rl l    ; |Finish converting A to BCD
    adc a,a \ daa \ rl l    ; |Number is in cA
    adc a,a \ daa \ rl l    ; |(c is carry)
    adc a,a \ daa \ rl l    ; |The magnitude is at most 128, so only the
    adc a,a \ daa           ;/ last DAA can carry, and the hundreds digit is 1
    ex de,hl            ;HL -> exponent byte; the carry is untouched
    jr nc,setXXX_signed_2digits
;Three digits: exponent 82h, mantissa digits 1,tens,ones.
    ld (hl),82h
    inc hl
    inc hl
    ld (hl),a           ;second mantissa byte = tens:ones for now
    xor a
    rld                 ;A = 0:tens, second mantissa byte = ones:0
    or $10              ;put the hundreds digit (1) in front of the tens
    dec hl
    ld (hl),a
    ret
setXXX_signed_2digits:
    inc hl
    ld (hl),a
    and $F0
    ret nz
    rld         ;\ Rotate up 1 digit
    dec hl      ; |
    ld (hl),80h ; |
    ret         ; /


The slowest routine here has a worst case of 499cc and slowest average case is 436cc.
Hey Runer112, I propose a speed optimization challenge >Smile
ConvOP1
ConvFloat
89 bytes, Avg: 496.577cc, Worst:525cc, preserves the float

Code:

ConvOP1:
;;Output: HL is the 16-bit result.
    ld de,OP1
ConvFloat:
;;Input: DE points to the float.
;;Output: HL is the 16-bit result.
;;Errors: DataType if the float is negative or complex
;;        Domain if the integer exceeds 16 bits.
;;Timings:  Assume no errors were called.
;;  Input is on:
;;  (0,1)         => 57cc                        Average=59
;;  0 or [1,10)   => 118cc or 127cc                     =124.5
;;  [10,100)      => 174cc or 175cc                     =176.5
;;  [100,1000)    => 307cc, 308cc, 316cc, or 317cc.     =312
;;  [1000,10000)  => 374cc to 376cc                     =375
;;  [10000,65536) => 512cc to 514cc, or 521cc to 523cc  =517.5
;;Average case:  494.577178955078125cc
;;vs 959.656982421875cc
;;86 bytes
;;Notes: digits after the integer part are never examined; they are simply
;;       dropped. An exponent byte below 80h returns HL=0.
;;       Digits are consumed two at a time (one mantissa byte per pass, with
;;       a *100 in between); an odd final digit is handled at "final".
 
    ld a,(de)
    or a
    jr nz,ErrDataType   ;any nonzero first byte is rejected
    inc de
    ld h,a
    ld l,a              ;HL = 0
    ld a,(de)           ;exponent byte
    inc de              ;DE -> first mantissa byte
    sub 80h             ;A = number of integer digits - 1
    ret c
    jr z,final          ;exactly one digit
    cp 5
    jp c,enterloop      ;2..5 digits
ErrDomain:
;Throws a domain error.
    bcall(_ErrDomain)
ErrDataType:
;Throws a data type error.
    bcall(_ErrDataType)
loop:
;HL = HL*100. B holds the remaining digit count and is parked in A meanwhile.
    ld a,b
    ld b,h
    ld c,l
    add hl,hl       ;*2
    add hl,bc       ;*3
    add hl,hl       ;*6
    add hl,hl       ;*12
    add hl,hl       ;*24
    add hl,bc       ;*25
    add hl,hl       ;*50
    add hl,hl       ;*100
enterloop:
    ld b,a
    ex de,hl
;Convert the packed-BCD byte at (HL) to binary: with byte = 16*hi + lo,
;A = 2*hi - 8*hi + byte = 10*hi + lo.
    ld a,(hl) \ and $F0 \ rra \ ld c,a \ rra \ rra \ sub c \ add a,(hl)
    inc hl
    ex de,hl
    add a,l
    ld l,a
    jr nc,$+3
    inc h
    dec b
    ret z           ;an even number of digits has been consumed and none are left
    djnz loop       ;two or more digits left: take another full byte
;Exactly one digit left: HL = HL*10, then add the high nibble of the next byte.
    ld b,h
    ld c,l
    xor a
;check overflow in this mul by 10!
    add hl,hl \ adc a,a
    add hl,hl \ adc a,a
    add hl,bc \ adc a,0
    add hl,hl \ adc a,a
    jr nz,ErrDomain
final:
    ld a,(de)
    rrca
    rrca
    rrca
    rrca
    and 15          ;A = high digit of the byte at (DE)
    add a,l
    ld l,a
    ret nc
    inc h
    ret nz
    jr ErrDomain    ;H wrapped to 0: the value does not fit in 16 bits

EDIT: Found a cheap optimization, -1 byte, -2cc.
arriopolis's text input routine might be useful to my ASM library, ASB Lib.
I am trying to learning how to program in assembly.
I am really good at programming in TI BASIC.
ASB wrote:
arriopolis's text input routine might be useful to my ASM library, ASB Lib.
I am trying to learning how to program in assembly.
I am really good at programming in TI BASIC.
This is not the right place for this. Please post your introduction post in the Introduce Yourself thread so that I can remove your post here. Thanks!
This is a quality pseudo-random number generator. It is very fast, passes all the tests that CAcert labs uses, and would take millions of years to go through one full period.

Code:

rand:
;;Tested and passes all CAcert tests
;;Uses a very simple 32-bit LCG and 32-bit LFSR
;;it has a period of 18,446,744,069,414,584,320
;;roughly 18.4 quintillion.
;;291cc
;;58 bytes
;;Output: HL = (upper word of the new LCG state) + (upper word of the new LFSR state)
;;Destroys: A, BC, DE
;;The four seed words are the immediate operands of the LD instructions below
;;and are rewritten on every call, so the routine must run from writable memory.
seed1_0=$+1
    ld hl,12345
seed1_1=$+1
    ld de,6789
;LCG step: state = state*5 + 1 (32-bit, DE:HL)
    ld b,h
    ld c,l
    add hl,hl \ rl e \ rl d
    add hl,hl \ rl e \ rl d     ;DE:HL = state*4
    inc l                       ;bit 0 is clear after the two shifts, so this cannot carry
    add hl,bc
    ld (seed1_0),hl
    ld hl,(seed1_1)
    adc hl,de                   ;upper word: old + 4*old + carry from the lower word
    ld (seed1_1),hl
    ex de,hl                    ;DE = upper word of the LCG state
;LFSR step: shift the 32-bit state left; if a bit falls out, XOR %11000101
;into the lowest byte.
seed2_0=$+1
    ld hl,9876
seed2_1=$+1
    ld bc,54321
    add hl,hl \ rl c \ rl b
    ld (seed2_1),bc
    sbc a,a                     ;A = $FF if a bit was shifted out, else 0
    and %11000101
    xor l
    ld l,a
    ld (seed2_0),hl
    ex de,hl
    add hl,bc
    ret
Fast L*L -> A

Code:

L_sqrd:
;Input: L
;Output: L*L->A
;Destroys: B, C, L
;147 t-states
;36 bytes
;Works branch-free: B is rotated right so that each input bit in turn lands in
;the carry, SBC A,A turns that bit into a $00/$FF mask, and the masked terms
;are summed in C. C accumulates the NEGATED square (see the step comments);
;the final SUB C flips the sign back.
    ld b,l
;First iteration, get the lowest 3 bits of -x^2
    sla l
    rrc b
    sbc a,a
    or l
    ld c,a
;second iteration, get the next 2 bits of -x^2
    rrc b
    sbc a,a
    xor l
    and $F8
    add a,c
    ld c,a
;third iteration, get the next 2 bits of -x^2
    sla l
    rrc b
    sbc a,a
    xor l
    and $E0
    add a,c
    ld c,a
;fourth iteration, get the eight bit of x^2
    sla l
    rrc b
    sbc a,a
    xor l
    and $80
    sub c
    ret

Here is how I came up with the algorithm.
Yes, tricky !
But I see the result is stored into an 8bit register, and there are only 16 possible values for L if we want L^2 to be stored in the Accumulator with no overflow.

Here is my "inelegant" version :

L*L -> A (size=28bytes, T-States=60)

Code:

 ; A = (L & 15)^2 via a 16-entry table. Only the low nibble of L is used, so
 ; inputs of 16 and above give the square of their low nibble.
 ; Destroys: HL, BC
 ld a,l
 and 15
 ld l,a
 ld h,0          ; HL = table index 0..15
 ld bc,data
 add hl,bc       ; HL -> data[index]
 ld a,(hl)
 ret
 data:     
 .db 0,1,4,9,16,25,36,49,64,81,100,121,144,169,196,225
It performs L^2 modulo 256. For example, 17^2 ==> 33.
Oh , I see Wink
In case of overflow, you get the "low byte" as result Wink
I don't know assembly, but if speed is a priority wouldn't it be advantageous to make a 256 element LUT? This can speed multiplication too.


Code:
; A = L*L mod 256 via a 256-entry table. Destroys: H
; L itself is the table index, so only H needs loading with the table's page.
ld h,table/256
ld a,(hl)
ret
;somewhere else, aligned with 256 byte boundary:
table:
.db $0, $1, $4, $9, $10, $19, $24, $31, $40, $51, $64, $79, $90, $a9, $c4, $e1, $0, $21, $44, $69, $90, $b9, $e4, $11, $40, $71, $a4, $d9, $10, $49, $84, $c1, $0, $41, $84, $c9, $10, $59, $a4, $f1, $40, $91, $e4, $39, $90, $e9, $44, $a1, $0, $61, $c4, $29, $90, $f9, $64, $d1, $40, $b1, $24, $99, $10, $89, $4, $81, $0, $81, $4, $89, $10, $99, $24, $b1, $40, $d1, $64, $f9, $90, $29, $c4, $61, $0, $a1, $44, $e9, $90, $39, $e4, $91, $40, $f1, $a4, $59, $10, $c9, $84, $41, $0, $c1, $84, $49, $10, $d9, $a4, $71, $40, $11, $e4, $b9, $90, $69, $44, $21, $0, $e1, $c4, $a9, $90, $79, $64, $51, $40, $31, $24, $19, $10, $9, $4, $1, $0, $1, $4, $9, $10, $19, $24, $31, $40, $51, $64, $79, $90, $a9, $c4, $e1, $0, $21, $44, $69, $90, $b9, $e4, $11, $40, $71, $a4, $d9, $10, $49, $84, $c1, $0, $41, $84, $c9, $10, $59, $a4, $f1, $40, $91, $e4, $39, $90, $e9, $44, $a1, $0, $61, $c4, $29, $90, $f9, $64, $d1, $40, $b1, $24, $99, $10, $89, $4, $81, $0, $81, $4, $89, $10, $99, $24, $b1, $40, $d1, $64, $f9, $90, $29, $c4, $61, $0, $a1, $44, $e9, $90, $39, $e4, $91, $40, $f1, $a4, $59, $10, $c9, $84, $41, $0, $c1, $84, $49, $10, $d9, $a4, $71, $40, $11, $e4, $b9, $90, $69, $44, $21, $0, $e1, $c4, $a9, $90, $79, $64, $51, $40, $31, $24, $19, $10, $9, $4, $1


I think the table can be reduced to 65 bytes by noticing that x^2 is equal to (128+x)^2, (128-x)^2 and (64-x)^2:

Code:
;a = l*l (mod 256), 14+65=79 bytes, 58 or 61 tstates
;Input: L    Output: A = L*L mod 256    Destroys: HL
;With x = L mod 128, the square depends only on |x-64| modulo 256:
;(64-i)^2 and (64+i)^2 differ by 256*i. The table is indexed by i = |x-64|.
ld a,l
and %01111111       ;x = L mod 128
sub $40             ;x-64, borrows when x < 64
jr nc,$+4 \ neg     ;A = |x-64|, in 0..64
ld l,a
ld h,table/256
ld a,(hl)
ret
;somewhere else, aligned with 256 byte boundary:
table:     ;64^2, 63^2,...,0^2
.db $0, $81, $4, $89, $10, $99, $24, $b1, $40, $d1, $64, $f9, $90, $29, $c4, $61, $0, $a1, $44, $e9, $90, $39, $e4, $91, $40, $f1, $a4, $59, $10, $c9, $84, $41, $0, $c1, $84, $49, $10, $d9, $a4, $71, $40, $11, $e4, $b9, $90, $69, $44, $21, $0, $e1, $c4, $a9, $90, $79, $64, $51, $40, $31, $24, $19, $10, $9, $4, $1, $0


If there's no space to fit it after a 256 byte boundary, an "add a,*" would add another 7 tstates.

EDIT: Here's Xeda's implementation of the idea that (2x+1)^2 = (4x^2 + 2x) + (2x+1) and (2x)^2 = (4x^2 + 2x) - 2x. This allows a 32 byte LUT.


Code:
sqrA:
;;A*A->A
;;Destroys: HL
;;76cc or 79cc or 82cc
;;Avg: 79cc
;;51 bytes
;;Method: reduce the input to y in 0..63 with the same square modulo 256,
;;then with k = y>>1 and sqrLUT[k] = 4k^2+2k:
;;  y odd:  y^2 = sqrLUT[k] + y
;;  y even: y^2 = sqrLUT[k] - y
    add a,a
    add a,a             ;A = 4*input, carry = bit 6 of the input
    jr nc,$+4 \ neg     ;bit 6 set: use the negated value instead
    rrca
    rrca                ;A = y, 0..63
    ld l,a
    srl l               ;L = k = y>>1, carry = low bit of y
    ld h,sqrLUT/256
    jr c,$+4 \ neg      ;y even: A = -y
    add a,(hl)
    ret
sqrLUT:
;;MUST BE ALIGNED to a 256-byte boundary.
;;Can use:
;;  #if 0!=$&255
;;  .fill 256-($&255),0
;;  #endif
.db $00,$06,$14,$2A,$48,$6E,$9C,$D2
.db $10,$56,$A4,$FA,$58,$BE,$2C,$A2
.db $20,$A6,$34,$CA,$68,$0E,$BC,$72
.db $30,$F6,$C4,$9A,$78,$5E,$4C,$42
I just want to post my very little sweet, and maybe slow division algorithm.
It divides A by B, and places the quotient in C and the remainder in A Smile


Code:
   ; C = A / B, A = A mod B, by repeated subtraction.
   ; With B = 0 the SUB never borrows, so the loop never ends.
   ld c, 0
_: sub a, b
   inc c
   jr nc, -_
   dec c           ; the loop counted the subtraction that went below zero
   add a, b        ; and that subtraction is undone here

Wonderful, not? Smile
PT_ wrote:
I just want to post my very little sweet, and maybe slow division algorithm.
It divides A by B, and places the quotient in C and the remainder in A Smile


Code:
   ; C = A / B, A = A mod B, by repeated subtraction.
   ; With B = 0 the SUB never borrows, so the loop never ends.
   ld c, 0
_: sub a, b
   inc c
   jr nc, -_
   dec c           ; the loop counted the subtraction that went below zero
   add a, b        ; and that subtraction is undone here

Wonderful, not? Smile


Yes, soooo tiny Smile However, be careful of the neverending loop if B=0
lirtosiast: Really nice job! Now I want to explore that idea some more Wink

PT_ wrote:
I just want to post my very little sweet, and maybe slow division algorithm.
It divides A by B, and places the quotient in C and the remainder in A Smile


Code:
   ; C = A / B, A = A mod B, by repeated subtraction.
   ; With B = 0 the SUB never borrows, so the loop never ends.
   ld c, 0
_: sub a, b
   inc c
   jr nc, -_
   dec c           ; the loop counted the subtraction that went below zero
   add a, b        ; and that subtraction is undone here

Wonderful, not? Smile

To optimize just a teeny bit:

Code:
   ld c, -1
_: sub a, b
   inc c
   jr nc, -_
   add a, b

Razz To fix the issue when b=0:

Code:
   ld c, -1
   inc b
   dec b
   jr z,$+7
_: sub a, b
   inc c
   jr nc, -_
   add a, b

So the net gain is 3 bytes and roughly 11cc (on average 11+1/256cc) to account for that issue.
Since I was able to optimize my best 16-bit multiply even further today, I thought I'd share! I even think it can be further optimized. And for that matter, here is my favorite DE_Times_A, too.

mul16 596.34375cc, 92 bytes (incl. DE_Times_A)

Code:

mul16:
;Inputs:
;   BC,DE are unsigned integers
;Output:
;   HL:DE is the 32-bit product
;Destroys:
;   A,B,C
;min: 359cc
;max: 717cc
;avg: 596.34375cc
;92 bytes
;Method: DE*BC = (DE*B)*256 + DE*C, using two 16x8 multiplies.
    ld a,c
    call DE_Times_A
    push hl             ;low 16 bits of DE*C
    push af             ;top 8 bits of DE*C (in A)
    ld a,b
    call DE_Times_A+2   ;skips the "ld c,0": C is still 0 from the first call
    pop bc              ;B = top 8 bits of DE*C
    pop de              ;DE = low 16 bits of DE*C
;AHL
; BDE
    ld c,d              ;BC = upper two bytes of DE*C
    add hl,bc
    adc a,0
;AHLE
    ld d,l
    ld l,h
    ld h,a
;HLDE
    ret
DE_Times_A:
;Input: DE,A
;Output: A:HL is the product, C=0, B,DE unaffected, z flag set if result is zero, c flag set if A is input as 1, else nc.
;A:128~255 219+6{0,10}+{0,19}    avg=258.5   *1/2
;A:64~127  203+5{0,10}+{0,19}    avg=237.5   *1/4
;A:32~63   187+4{0,10}+{0,19}    avg=216.5   *1/8
;A:16~31   171+3{0,10}+{0,19}    avg=195.5   *1/16
;A:8~15    155+2{0,10}+{0,19}    avg=174.5   *1/32
;A:4~7     139+{0,10}+{0,19}     avg=153.5   *1/64
;A:2~3     123+{0,19}            avg=132.5   *1/128
;A:1       107cc                 avg=107     *1/256
;A:0       119cc                 avg=119     *1/256
;overall avg: 237.671875cc
;Method: the first chain of shifts finds the highest set bit of A and jumps
;into the unrolled shift-and-add steps with only the remaining bits to do.
;In those steps A does double duty: the unprocessed multiplier bits leave from
;its top while the product's upper byte is shifted in at the bottom.
    ld c,0
    ld h,d
    ld l,e              ;HL = DE, the product so far for the leading 1 bit
    add a,a \ jr c,mul_07   ;ADD (not RLA) so the incoming carry is ignored
    rla \ jr c,mul_06
    rla \ jr c,mul_05
    rla \ jr c,mul_04
    rla \ jr c,mul_03
    rla \ jr c,mul_02
    rla \ jr c,mul_01
    rla
    ret c               ;A was 1: the product is DE itself, A is 0 by now
    ld h,a
    ld l,a              ;A was 0: the product is 0
    ret
mul_07:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_06:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_05:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_04:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_03:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_02:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_01:
    add hl,hl \ rla \ ret nc \ add hl,de \ adc a,c
    ret


DE_Times_A, 237.671875cc, 72 bytes

Code:

DE_Times_A:
;Input: DE,A
;Output: A:HL is the product, C=0, B,DE unaffected, z flag set if result is zero, c flag set if A is input as 1, else nc.
;A:128~255 219+6{0,10}+{0,19}    avg=258.5   *1/2
;A:64~127  203+5{0,10}+{0,19}    avg=237.5   *1/4
;A:32~63   187+4{0,10}+{0,19}    avg=216.5   *1/8
;A:16~31   171+3{0,10}+{0,19}    avg=195.5   *1/16
;A:8~15    155+2{0,10}+{0,19}    avg=174.5   *1/32
;A:4~7     139+{0,10}+{0,19}     avg=153.5   *1/64
;A:2~3     123+{0,19}            avg=132.5   *1/128
;A:1       107cc                 avg=107     *1/256
;A:0       119cc                 avg=119     *1/256
;overall avg: 237.671875cc
;Method: the first chain of shifts finds the highest set bit of A and jumps
;into the unrolled shift-and-add steps with only the remaining bits to do.
;In those steps A does double duty: the unprocessed multiplier bits leave from
;its top while the product's upper byte is shifted in at the bottom.
    ld c,0
    ld h,d
    ld l,e              ;HL = DE, the product so far for the leading 1 bit
    add a,a \ jr c,mul_07   ;ADD (not RLA) so the incoming carry is ignored
    rla \ jr c,mul_06
    rla \ jr c,mul_05
    rla \ jr c,mul_04
    rla \ jr c,mul_03
    rla \ jr c,mul_02
    rla \ jr c,mul_01
    rla
    ret c               ;A was 1: the product is DE itself, A is 0 by now
    ld h,a
    ld l,a              ;A was 0: the product is 0
    ret
mul_07:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_06:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_05:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_04:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_03:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_02:
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_01:
    add hl,hl \ rla \ ret nc \ add hl,de \ adc a,c
    ret


EDIT: I spotted a bug in the DE_Times_A routine! If A=1 and the c flag is set upon calling, it will erroneously return A=128. The fix was to turn the first 'rla' into 'add a,a'. Thankfully I haven't included the routine in any other work yet.
  
Register to Join the Conversation
Have your own thoughts to add to this or any other topic? Want to ask a question, offer a suggestion, share your own programs and projects, upload a file to the file archives, get help with calculator and computer programming, or simply chat with like-minded coders and tech and calculator enthusiasts via the site-wide AJAX SAX widget? Registration for a free Cemetech account only takes a minute.

» Go to Registration page
» Goto page Previous  1, 2, 3, 4, 5, 6, 7, 8  Next
» View previous topic :: View next topic  
Page 7 of 8
» All times are UTC - 5 Hours
 
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum

 

Advertisement