The 84CE does not have a native FPU, making floats slow. How slow? Below is each data type benchmarked over 1000 iterations of the corresponding operation. I'm surprised that floating point mul/div is faster than add/sub. Here's the results (I'm unsure how rigorous the test itself is, although the emulator seems to always return the same results):


Code:
-------------------------------------------------------------------------------
Operation  Time       (B) Data Type
-------------------------------------------------------------------------------
add/sub:   0.000030   (1) int8_t
mul/div:   0.166229   (1) int8_t
add/sub:   0.000000   (1) uint8_t
mul/div:   0.058135   (1) uint8_t
add/sub:   0.000030   (2) short
mul/div:   0.235931   (2) short
add/sub:   0.000030   (2) unsigned short
mul/div:   0.154693   (2) unsigned short
add/sub:   0.000061   (3) int
mul/div:   0.243041   (3) int
add/sub:   0.000030   (3) unsigned int
mul/div:   0.220855   (3) unsigned int
add/sub:   0.000030   (4) long
mul/div:   0.398284   (4) long
add/sub:   0.000061   (4) unsigned long
mul/div:   0.366790   (4) unsigned long
add/sub:   0.000244   (8) long long
mul/div:   1.712463   (8) long long
add/sub:   0.000244   (8) unsigned long long
mul/div:   1.677520   (8) unsigned long long
add/sub:   0.671569   (4) float
mul/div:   0.590515   (4) float
add/sub:   0.659637   (4) double
mul/div:   0.671295   (4) double


and the source code, courtesy of stackoverflow Laughing

Code:
#include <debug.h>

#include <stdlib.h> // rand()
#include <stdint.h>
#include <time.h> // clock()

template <typename T>
void benchmark(const char* name)
{
    T v  = 0;

    //  - Does not use constants or repeating values to avoid loop unroll optimizations.
    //  - All values >0 to avoid division by 0.
    //  - Perform ten ops/iteration to reduce impact of ++i below on measurements.
    T v0 = (T)(rand() & 255)/16 + 1;
    T v1 = (T)(rand() & 255)/16 + 1;
    T v2 = (T)(rand() & 255)/16 + 1;
    T v3 = (T)(rand() & 255)/16 + 1;
    T v4 = (T)(rand() & 255)/16 + 1;
    T v5 = (T)(rand() & 255)/16 + 1;
    T v6 = (T)(rand() & 255)/16 + 1;
    T v7 = (T)(rand() & 255)/16 + 1;
    T v8 = (T)(rand() & 255)/16 + 1;
    T v9 = (T)(rand() & 255)/16 + 1;

    clock_t t1 = clock();

    for (auto i{0u}; i < 1000; ++i)
    {
        v += v0; v -= v1;
        v += v2; v -= v3;
        v += v4; v -= v5;
        v += v6; v -= v7;
        v += v8; v -= v9;
    }

    clock_t t2 = clock();

    //  Pretend we make use of v so compiler doesn't optimize out the loop completely
    if((int)v&1) dbg_ClearConsole();

    dbg_printf("add/sub:\t%f\t(%d) %s\n", (t2 - t1)/32768.0f, sizeof(T), name);

    t1 = clock();

    for (auto i{0u}; i < 1000; ++i)
    {
        v /= v0; v *= v1;
        v /= v2; v *= v3;
        v /= v4; v *= v5;
        v /= v6; v *= v7;
        v /= v8; v *= v9;
    }

    t2 = clock();

    //  Pretend we make use of v so compiler doesn't optimize out the loop completely
    if((int)v&1) dbg_ClearConsole();

    dbg_printf("mul/div:\t%f\t(%d) %s\n", (t2 - t1)/32768.0f, sizeof(T), name);
}

int main()
{
    dbg_printf("-------------------------------------------------------------------------------\n");
    dbg_printf("Operation\tTime\t(B) Data Type\n");
    dbg_printf("-------------------------------------------------------------------------------\n");

    benchmark< int8_t >("int8");
    benchmark< uint8_t>("uint8");
    benchmark< short >("short");
    benchmark< unsigned short >("unsigned short");
    benchmark< int >("int");
    benchmark< unsigned int >("unsigned int");
    benchmark< long >("long");
    benchmark< unsigned long> ("unsigned long");
    benchmark< long long>("long long");
    benchmark< unsigned long long>("unsigned long long");
    benchmark< float >("float");
    benchmark< double >("double");

    return 0;
}


I tried making a simple wrapper class for int that emulated low decimal precision by scaling the "float" by a factor of 2^x (to maximize the number of bit shifting operations I could use), but it only ended up being around 30% faster than actual floats at mul/div, although about 8 or 9 times more efficient at add/sub than floats.

Here's what I have, and it's been cool seeing how small changes in code makes a difference in the benchmark test... although I can optimize it more! (Heck, I even changed all the pre-increments in the for loops to post-increments as I heard they were faster, and it was cool seeing the speed ever so slightly decrease.)

Header

Code:
#pragma once

#include <math.h>
#include <stdint.h>

// Float is a simple wrapper class for int that emulates decimal precision by
// bit shifting. Performs about 30% better than primitive float at the cost of
// less rigorous precision and smaller carrying capacity
class Float
{
    // Number of fractional bits: values are stored as m_Value = real * 2^4,
    // giving 1/16 resolution.
    static constexpr int PRECISION_BITS = 4;

    // Raw fixed-point representation (integer part << PRECISION_BITS | fraction).
    int m_Value{0};

public:

    // Construct from a whole number (scaled up by 2^PRECISION_BITS).
    Float(int num);
    // Construct from a float (fractional part truncated to PRECISION_BITS bits).
    Float(float num);

    // Convert back to int, discarding the fractional bits.
    operator int() const;

    // NOTE(review): these binary operators mutate lhs and return a reference
    // to it, so `a + b` also modifies `a`. Conventional operator+ would take
    // lhs by value and return a new Float; changing that requires updating
    // the matching definitions in the .cpp as well.
    friend Float& operator+(Float& lhs, const Float& rhs);
    friend Float& operator-(Float& lhs, const Float& rhs);
    friend Float& operator*(Float& lhs, const Float& rhs);
    friend Float& operator/(Float& lhs, const Float& rhs);

    // Compound assignment: modify *this in place and return it.
    Float& operator+=(const Float& rhs);
    Float& operator-=(const Float& rhs);
    Float& operator*=(const Float& rhs);
    Float& operator/=(const Float& rhs);

private:

    // Internal implementations shared by the operators above.
    Float& add(const Float& rhs);
    Float& sub(const Float& rhs);
    Float& mul(const Float& rhs);
    Float& div(const Float& rhs);

};


Source

Code:

#include "Float.h"

// Construct from a whole number by scaling up into fixed-point.
// Multiply instead of "num << PRECISION_BITS": left-shifting a negative int
// is undefined behavior before C++20, while the multiply is well-defined and
// compiles to the same shift for non-negative values.
Float::Float(int num)
    : m_Value   { num * (1 << PRECISION_BITS) }
{}

// Construct from a float by scaling up and truncating toward zero.
// (1 << PRECISION_BITS) is a compile-time constant; the original
// pow(2, PRECISION_BITS) was a runtime math-library call in double
// precision — exactly the slow float math this class exists to avoid.
Float::Float(float num)
    : m_Value   { static_cast<int>( num * (1 << PRECISION_BITS) ) }
{}

Float::operator int() const
{
    // Drop the fractional bits to recover the whole part. The arithmetic
    // right shift rounds toward negative infinity for negative values
    // (unlike integer division, which rounds toward zero).
    const int whole = m_Value >> PRECISION_BITS;
    return whole;
}

// NOTE(review): these binary operators mutate lhs and return a reference to
// it, so `a + b` also modifies `a`. The signatures match the header's friend
// declarations, so they are kept as-is here; expressed via the compound
// operators, which forward to the same private helpers.
Float& operator+(Float& lhs, const Float& rhs) { return lhs += rhs; }
Float& operator-(Float& lhs, const Float& rhs) { return lhs -= rhs; }
Float& operator*(Float& lhs, const Float& rhs) { return lhs *= rhs; }
Float& operator/(Float& lhs, const Float& rhs) { return lhs /= rhs; }

// Compound assignment: each forwards to the matching private helper,
// which mutates *this and returns it.
Float& Float::operator+=(const Float& rhs) { return add(rhs); }
Float& Float::operator-=(const Float& rhs) { return sub(rhs); }
Float& Float::operator*=(const Float& rhs) { return mul(rhs); }
Float& Float::operator/=(const Float& rhs) { return div(rhs); }

Float& Float::add(const Float& rhs)
{
    // Both values carry the same 2^PRECISION_BITS scale, so a plain
    // integer add is exact.
    m_Value += rhs.m_Value;
    return *this;
}
Float& Float::sub(const Float& rhs)
{
    // Same scale on both sides, so a plain integer subtract is exact.
    m_Value -= rhs.m_Value;
    return *this;
}
Float& Float::mul(const Float& rhs)
{
    // Each operand carries a 2^PRECISION_BITS scale, so the raw product is
    // scaled by 2^(2*PRECISION_BITS); shift once to restore a single factor.
    // The intermediate product can overflow int — part of this class's
    // documented "smaller carrying capacity" tradeoff.
    const int product = m_Value * rhs.m_Value;
    m_Value = product >> PRECISION_BITS;
    return *this;
}
// Divide in place. The dividend is pre-scaled so the quotient keeps its
// fractional bits; doing "(f1 / f2) << BITS" instead would lose precision
// to the integer division. Multiply instead of "m_Value << PRECISION_BITS":
// left-shifting a negative value is undefined behavior before C++20, while
// the multiply is well-defined and compiles to the same shift for
// non-negative values. Divides by zero if rhs holds 0 — caller's contract.
Float& Float::div(const Float& rhs)
{
    this->m_Value = (this->m_Value * (1 << PRECISION_BITS)) / rhs.m_Value;
    return *this;
}


Anyway, merry christmas y'all. I've been pretty quiet around here lately, but I look forward to changing that. I hope everyone is staying warm and safe. Very Happy
  
Register to Join the Conversation
Have your own thoughts to add to this or any other topic? Want to ask a question, offer a suggestion, share your own programs and projects, upload a file to the file archives, get help with calculator and computer programming, or simply chat with like-minded coders and tech and calculator enthusiasts via the site-wide AJAX SAX widget? Registration for a free Cemetech account only takes a minute.

» Go to Registration page
Page 1 of 1
» All times are UTC - 5 Hours
 
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum

 

Advertisement