Hey folks!

I propose we all look for the fastest Sprite routine possible.

Studying Mateo's great tutorial "Drawing Sprite" ...
( https://ce-programming.github.io/documentation/tutorials/asm/drawing-sprites/ )
I wondered if we could find optimisations for this sprite routine:


Code:
;---------------------------------------------------------------
; DrawSprite
;   Copies a sprite into vRam one row at a time with ldir.
; In:
;   bc = x
;   l  = y
;   de = pointer to sprite: width byte, height byte, then the rows
; Destroys:
;   af, bc, de, hl
;---------------------------------------------------------------
DrawSprite:
        ld      h,lcdWidth/2
        mlt     hl                      ; hl = y * (lcdWidth/2)
        add     hl,hl                   ; doubled: one full row per y
        add     hl,bc                   ; plus the x offset
        ld      bc,vRam
        add     hl,bc                   ; hl -> first destination byte
        ld      a,(de)                  ; a = sprite width
        inc     de
        ld      (SpriteWidthSMC),a      ; patch the row length into the loop
        ld      a,(de)                  ; a = sprite height = rows left
        inc     de                      ; de -> first byte of sprite rows
        ex      de,hl                   ; hl = source, de = destination
DrawSprite_RowLoop:
SpriteWidthSMC: =$+1
        ld      bc,0                    ; low byte is the patched width
        push    de                      ; keep the start of this screen row
        ldir                            ; copy one row of the sprite
        pop     de
        ex      de,hl                   ; hl = start of the row just drawn
        ld      bc,lcdWidth
        add     hl,bc                   ; step to the same x on the next row
        ex      de,hl                   ; back to hl = source, de = destination
        dec     a
        jr      nz,DrawSprite_RowLoop
        ret


Here is my attempt:


Code:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; DrawSprite
; inputs:
;         bc = x
;          l = y
;         de = pointer to sprite (width byte, height byte, then the rows)
; destroys:
;         af, bc, de, hl, ix
; notes:
;         sp is borrowed to hold lcdWidth during the loop, so interrupts
;         are disabled while drawing and re-enabled on exit (they end up
;         enabled even if the caller had them disabled).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
DrawSprite:
 ld h,lcdWidth/2       ; h=160
 mlt hl                ; 160*y
 add hl,hl             ; hl*2
 add hl,bc             ; add x
 ex de,hl              ; hl = sprite pointer, de = screen offset
 ld ix,vRam
 add ix,de             ; offset vRam: ix = destination of the first row
 ld a,(hl)             ; width
 ld (SpriteWidthSMC),a ; patch the row length into the loop
 inc hl
 ld a,(hl)             ; height = row counter
 inc hl                ; hl = first byte of sprite rows
 ld bc,0               ; clear b (and the upper byte); the loop only sets c
 di                    ; sp is about to stop being a stack pointer
 ld (stpo+1),sp        ; save the real sp into the "ld sp" at stpo
 ld sp,lcdWidth        ; sp = bytes per screen row
drawSpriteLoop:
SpriteWidthSMC: =$+1
 ld c,0                ; bc = width (patched above)
 lea de,ix+0           ; de = start of the current destination row
 ldir                  ; draw line
 add ix,sp             ; move down
 dec a
 jr nz,drawSpriteLoop
stpo:
 ld sp,0               ; restore the saved sp (operand patched above)
 ei                    ; was "di": interrupts must be turned back on here
 ret


EDIT: I just put "ld bc,0" before "di" Wink
Here's my attempt (note: untested!). I restricted myself to not temporarily disabling interrupts, which is appropriate for a general-use routine:

Code:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; DrawSprite
; inputs:
;       bc = x
;       l = y
;       de = pointer to sprite (width byte, height byte, then the rows)
; destroys:
;       af, bc, de, hl
; notes:
;       After each ldir, hl (swapped back) points just past the row that
;       was drawn, so the step to the next row is lcdWidth - width.
;       The "big" loop is used when that step fits in one byte; the
;       "small" loop is used when it is 256 + one byte.
;       Interrupts are left untouched.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
DrawSprite:
        ld h,lcdWidth/2                 ; h=160
        mlt hl                          ; 160*y
        add hl,hl                       ; hl*2
        add hl,bc                       ; add x
        ld bc,vRam
        add hl,bc                       ; offset vRam
        ld bc,0                         ; bc still held vRam here; the loops only
                                        ; patch c, and ldir counts with all of bc
        ld a,(de)                       ; width
        ld (DrawSprite_Small_Width),a
        sub a,lcdWidth-$FF              ; carry set when lcdWidth - width > 255
        cpl                             ; a = low byte of (lcdWidth - width):
                                        ; start of next line - end of this line
        jr nc,DrawSprite_Big            ; cpl leaves the carry from sub intact
DrawSprite_Small:
        ld (DrawSprite_Small_DeltaNext),a
        inc de
        ld a,(de)                       ; height
        inc de
DrawSprite_Small_Loop:
        ex de,hl                        ; hl = sprite data, de = screen
DrawSprite_Small_Width = $+1
        ld c,0                          ; bc = width (patched above)
        ldir                            ; draw line
        ex de,hl                        ; hl = screen, de = sprite data
DrawSprite_Small_DeltaNext = $+1
        ld c,0                          ; low byte of the step (patched above)
        inc b                           ; b = 1: step is 256 + c
        add hl,bc                       ; move down
        dec b                           ; b = 0 again for the next ldir count
        dec a
        jr nz,DrawSprite_Small_Loop
        ret

DrawSprite_Big:
        ld (DrawSprite_Big_DeltaNext),a
        ld a,(de)                       ; width again (a was reused for the step)
        ld (DrawSprite_Big_Width),a
        inc de
        ld a,(de)                       ; height
        inc de
DrawSprite_Big_Loop:
        ex de,hl                        ; hl = sprite data, de = screen
DrawSprite_Big_Width = $+1
        ld c,0                          ; bc = width (patched above)
        ldir                            ; draw line
        ex de,hl                        ; hl = screen, de = sprite data
DrawSprite_Big_DeltaNext = $+1
        ld c,0                          ; whole step fits in c (patched above)
        add hl,bc                       ; move down
        dec a
        jr nz,DrawSprite_Big_Loop
        ret


And the all-important analytics:

Code:
 Version  | Bytes |                      Cycles                       | Side effects
          |       |  8x8  | 16x16 | 32x32 | 64x64 | 128x128 | 255x255 |
----------+-------+-------+-------+-------+-------+---------+------------------------------------------------
 Original |    39 |  1311 |  3407 | 10287 | 34799 |  126831 |  479256 | Destroys AF, BC, DE, HL
 grosged  |    54 |  1065 |  2825 |  9033 | 32201 |  121545 |  468636 | Destroys AF, BC, DE, HL, IX; delays interrupts
 Runer112 |    68 |  1073 |  2897 |  9233 | 32657 |  121518 |  468609 | Destroys AF, BC, DE, HL


Some notes:
  • My version has two code paths: "small" and "big." The slightly slower "small" path has to be used if width ≤ 64.
  • All versions could unroll the main loop to save a small number of cycles (at the cost of a large number of bytes).
  • Combine my general approach with grosged's sp usage for ultimate speed. This would have the nice bonus of eliminating the need I had for two slightly different code paths.
Kudos, Runer112 ! =D

I thought of another optimisation: going back to ex de,hl + add hl,sp ...
Here it is :


Code:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; DrawSprite
; inputs:
;         bc = x
;          l = y
;         de = pointer to sprite (width byte, height byte, then the rows)
; destroys:
;         af, bc, de, hl
; notes:
;         sp is borrowed to hold (lcdWidth - width) during the loop, so
;         interrupts are disabled while drawing and re-enabled on exit
;         (they end up enabled even if the caller had them disabled).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
DrawSprite:
 ld h,lcdWidth/2       ; h=160
 mlt hl                ; 160*y
 add hl,hl             ; hl*2
 add hl,bc             ; add x
 ld bc,vRam
 add hl,bc             ; offset vRam
 ld a,(de)             ; width
 ld (SpriteWidthSMC),a ; patch the row length into the loop
 ld (subWidth+1),a     ; patch the operand of the "sub" below
 inc de                ; de = address of the height byte
 ld bc,lcdWidth
 ld a,c                ; a = low byte of lcdWidth
subWidth:
 sub 32                ; operand patched to width
 jr nc,NoCarry
 dec b                 ; borrow from the high byte
NoCarry:
 ld c,a                ; bc = lcdWidth - width
 ld (NewSP+1),bc       ; becomes the value loaded into sp below
 ld b,0                ; the loop only sets c, so b must be 0 for ldir
 ld a,(de)             ; height, loaded only now: a was busy with the
 inc de                ; subtraction above, which used to overwrite it
 di                    ; sp is about to stop being a stack pointer
 ld (OldSP+1),sp       ; save the real sp into the "ld sp" at OldSP
NewSP:
 ld sp,0               ; sp = lcdWidth - width (operand patched above)
drawSpriteLoop:
SpriteWidthSMC: =$+1
 ld c,0                ; bc = width (patched above)
 ex de,hl              ; hl = sprite data, de = screen
 ldir                  ; draw line
 ex de,hl              ; hl = just past the row drawn, de = sprite data
 add hl,sp             ; move down
 dec a
 jr nz,drawSpriteLoop
OldSP:
 ld sp,0               ; restore the saved sp (operand patched above)
 ei                    ; was "di": interrupts must be turned back on here
 ret


...And this no longer uses the IX register, though it still uses SP!

EDIT: Runer112, could you please update the analytics ?..Thanks in advance Wink
Now here is the unrolled version: obviously bigger, but faster!

Code:

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; DrawSprite (unrolled: one copy of the row code per row, entered by a
;             computed jump instead of a counted loop)
; inputs:
;         bc = x
;          l = y
;         de = pointer to sprite
;         The sprite data begins with two bytes: width, then height.
;         NOTE(review): the original note gave "width up to 640", but the
;         width is read below as a single byte (ld a,(de)) - confirm the
;         intended limit.
;         Height up to 240: there are 239 full copies of the row code
;         before PrgJump plus the final copy at PrgJump.
;         The height is not range-checked before it is used to compute
;         the jump address.
; notes:
;         sp is borrowed to hold (lcdWidth - width); interrupts are
;         disabled while drawing and enabled again on exit.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
DrawSprite:
 ld h,lcdWidth/4  ; 16bpp mode ok, now (presumably so lcdWidth/4 still fits in h when a row is 640 bytes - confirm)
 mlt hl               ; hl = y * (lcdWidth/4)
 add hl,hl
 add hl,hl             ; two doublings undo the /4
 add hl,bc             ; add x
 ld bc,$d40000
 add hl,bc             ; offset vRam
 ld (Ofvram+1),hl      ; keep the destination in the "ld hl" at Ofvram
 ld a,(de)             ; width
 inc de
 ld bc,0
 ld c,a                ; bc = width
 ld hl,lcdWidth
 sbc hl,bc             ; hl = lcdWidth - width; relies on carry being clear (last set by the add hl,bc above)
 di                    ; sp is about to stop being a stack pointer
 ld (OldSP+1),sp       ; save the real sp into the "ld sp" at OldSP
 ld sp,hl              ; sp = step from the end of one row to the start of the next
 ex de,hl
 ld c,(hl)             ; height
 inc hl
 ex de,hl              ; de = first byte after the two header bytes
 ld b,6                ; each full copy of the row code is 6 bytes
 mlt bc                ; bc = 6 * height
 ld hl,PrgJump+6
 sbc hl,bc             ; entry point = PrgJump - 6*(height-1); relies on carry still being clear
 ld (jpaddr+1),hl      ; patch the target of the jp below
 ld b,0                ; row code only sets c, so b must be 0 for ldir
Ofvram:
 ld hl,0               ; hl = destination (operand patched above)
jpaddr:
 jp 0                  ; target patched above: skips the copies that are not needed
; Each group of three words below is the 6 bytes 4F EB ED B0 EB 39,
; i.e. the same five instructions that are written out just before PrgJump.
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 .dw $EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB,$EB4F,$B0ED,$39EB
 ld c,a    ; 4F - bc = width (b is 0)
 ex de,hl   ; EB - hl = sprite data, de = screen
 ldir      ; ED B0 - draw line
 ex de,hl   ; EB - hl = just past the row drawn, de = sprite data
 add hl,sp    ; 39 - move down
PrgJump:
; Final row: no pointer update is needed after it.
 ld c,a
 ex de,hl
 ldir
OldSP:
 ld sp,0               ; restore the saved sp (operand patched above)
 ei
 ret



And last (but not least!), we can improve speed further by storing the sprite data in a memory area with faster read access: the 1024 bytes of cursor image RAM (starting at address $E30800) :)
  
Register to Join the Conversation
Have your own thoughts to add to this or any other topic? Want to ask a question, offer a suggestion, share your own programs and projects, upload a file to the file archives, get help with calculator and computer programming, or simply chat with like-minded coders and tech and calculator enthusiasts via the site-wide AJAX SAX widget? Registration for a free Cemetech account only takes a minute.

» Go to Registration page
Page 1 of 1
» All times are UTC - 5 Hours
 
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum

 

Advertisement