The 84CE does not have a native FPU, making floats slow. How slow? Below is each data type benchmarked over 1000 iterations of the corresponding operation. I'm surprised that floating point add/sub is slower than mul/div. Here's the results (I'm unsure how rigorous the test itself is, although the emulator seems to always return the same results):
Code:
and the source code, courtesy of stackoverflow
Code:
I tried making a simple wrapper class for int that emulated low decimal precision by scaling the "float" by a factor of 2^x (to maximize the number of bit shifting operations I could use), but it only ended up being around 30% faster than actual floats at mul/div, although about 8 or 9 times more efficient at add/sub than floats.
Here's what I have, and it's been cool seeing how small changes in code make a difference in the benchmark test... although I can optimize it more! (Heck, I even changed all the pre-increments in the for loops to post-increments as I heard they were faster, and it was cool seeing the speed ever so slightly decrease.)
Header
Code:
Source
Code:
Anyway, merry christmas y'all. I've been pretty quiet around here lately, but I look forward to changing that. I hope everyone is staying warm and safe.
Code:
-------------------------------------------------------------------------------
Operation Time (B) Data Type
-------------------------------------------------------------------------------
add/sub: 0.000030 (1) int8_t
mul/div: 0.166229 (1) int8_t
add/sub: 0.000000 (1) uint8_t
mul/div: 0.058135 (1) uint8_t
add/sub: 0.000030 (2) short
mul/div: 0.235931 (2) short
add/sub: 0.000030 (2) unsigned short
mul/div: 0.154693 (2) unsigned short
add/sub: 0.000061 (3) int
mul/div: 0.243041 (3) int
add/sub: 0.000030 (3) unsigned int
mul/div: 0.220855 (3) unsigned int
add/sub: 0.000030 (4) long
mul/div: 0.398284 (4) long
add/sub: 0.000061 (4) unsigned long
mul/div: 0.366790 (4) unsigned long
add/sub: 0.000244 (8) long long
mul/div: 1.712463 (8) long long
add/sub: 0.000244 (8) unsigned long long
mul/div: 1.677520 (8) unsigned long long
add/sub: 0.671569 (4) float
mul/div: 0.590515 (4) float
add/sub: 0.659637 (4) double
mul/div: 0.671295 (4) double
and the source code, courtesy of stackoverflow
Code:
#include <debug.h>
#include <stdlib.h> // rand()
#include <stdint.h>
#include <time.h> // clock()
// Times 1000 iterations of ten add/sub ops, then ten mul/div ops, on type T,
// printing one table row per loop via dbg_printf.
// name: human-readable type name for the printed row.
template <typename T>
void benchmark(const char* name)
{
    T v = 0;
    // - Does not use constants or repeating values, to avoid loop-unroll optimizations.
    // - All values > 0 to avoid division by zero.
    // - Performs ten ops/iteration to reduce the impact of ++i below on measurements.
    T v0 = (T)(rand() & 255)/16 + 1;
    T v1 = (T)(rand() & 255)/16 + 1;
    T v2 = (T)(rand() & 255)/16 + 1;
    T v3 = (T)(rand() & 255)/16 + 1;
    T v4 = (T)(rand() & 255)/16 + 1;
    T v5 = (T)(rand() & 255)/16 + 1;
    T v6 = (T)(rand() & 255)/16 + 1;
    T v7 = (T)(rand() & 255)/16 + 1;
    T v8 = (T)(rand() & 255)/16 + 1;
    T v9 = (T)(rand() & 255)/16 + 1;

    clock_t t1 = clock();
    for (auto i{0u}; i < 1000; ++i)
    {
        v += v0; v -= v1;
        v += v2; v -= v3;
        v += v4; v -= v5;
        v += v6; v -= v7;
        v += v8; v -= v9;
    }
    clock_t t2 = clock();
    // Pretend we make use of v so the compiler doesn't optimize out the loop completely.
    if ((int)v & 1) dbg_ClearConsole();
    // CLOCKS_PER_SEC replaces the hard-coded 32768.0f (same value on the CE,
    // but portable and self-documenting). sizeof(T) yields a size_t, which
    // %d must not receive — cast to int to keep the varargs call well-defined.
    dbg_printf("add/sub:\t%f\t(%d) %s\n", (t2 - t1) / (float)CLOCKS_PER_SEC, (int)sizeof(T), name);

    t1 = clock();
    for (auto i{0u}; i < 1000; ++i)
    {
        v /= v0; v *= v1;
        v /= v2; v *= v3;
        v /= v4; v *= v5;
        v /= v6; v *= v7;
        v /= v8; v *= v9;
    }
    t2 = clock();
    // Pretend we make use of v so the compiler doesn't optimize out the loop completely.
    if ((int)v & 1) dbg_ClearConsole();
    dbg_printf("mul/div:\t%f\t(%d) %s\n", (t2 - t1) / (float)CLOCKS_PER_SEC, (int)sizeof(T), name);
}
// Entry point: prints the table header, then benchmarks every arithmetic type.
int main()
{
    // Divider line shared by the two rules framing the header.
    static const char* const kRule =
        "-------------------------------------------------------------------------------\n";
    dbg_printf("%s", kRule);
    dbg_printf("Operation\tTime\t(B) Data Type\n");
    dbg_printf("%s", kRule);

    // Integer types, narrowest to widest.
    benchmark<int8_t>("int8");
    benchmark<uint8_t>("uint8");
    benchmark<short>("short");
    benchmark<unsigned short>("unsigned short");
    benchmark<int>("int");
    benchmark<unsigned int>("unsigned int");
    benchmark<long>("long");
    benchmark<unsigned long>("unsigned long");
    benchmark<long long>("long long");
    benchmark<unsigned long long>("unsigned long long");

    // Floating-point types.
    benchmark<float>("float");
    benchmark<double>("double");

    return 0;
}
I tried making a simple wrapper class for int that emulated low decimal precision by scaling the "float" by a factor of 2^x (to maximize the number of bit shifting operations I could use), but it only ended up being around 30% faster than actual floats at mul/div, although about 8 or 9 times more efficient at add/sub than floats.
Here's what I have, and it's been cool seeing how small changes in code make a difference in the benchmark test... although I can optimize it more! (Heck, I even changed all the pre-increments in the for loops to post-increments as I heard they were faster, and it was cool seeing the speed ever so slightly decrease.)
Header
Code:
#pragma once
#include <math.h>
#include <stdint.h>
// Float is a simple wrapper class for int that emulates decimal precision by
// bit shifting. Performs about 30% better than primitive float at the cost of
// less rigorous precision and smaller carrying capacity
class Float
{
static constexpr int PRECISION_BITS = 4;
int m_Value{0};
public:
Float(int num);
Float(float num);
operator int() const;
friend Float& operator+(Float& lhs, const Float& rhs);
friend Float& operator-(Float& lhs, const Float& rhs);
friend Float& operator*(Float& lhs, const Float& rhs);
friend Float& operator/(Float& lhs, const Float& rhs);
Float& operator+=(const Float& rhs);
Float& operator-=(const Float& rhs);
Float& operator*=(const Float& rhs);
Float& operator/=(const Float& rhs);
private:
Float& add(const Float& rhs);
Float& sub(const Float& rhs);
Float& mul(const Float& rhs);
Float& div(const Float& rhs);
};
Source
Code:
#include "Float.h"
// Construct from a whole number: scale up into fixed-point representation.
// Multiplication rather than << because left-shifting a negative int is
// undefined behavior; the compiler emits the same shift where it is safe.
Float::Float(int num)
: m_Value { num * (1 << PRECISION_BITS) }
{}

// Construct from a float: scale by 2^PRECISION_BITS and truncate.
// (1 << PRECISION_BITS) is a compile-time constant; the original
// pow(2, PRECISION_BITS) was a runtime double-precision library call —
// needlessly expensive, especially on an FPU-less target.
Float::Float(float num)
: m_Value { static_cast<int>( num * (1 << PRECISION_BITS) ) }
{}
// Convert back to a whole number by discarding the fraction bits.
// NOTE(review): >> on a negative value is implementation-defined (arithmetic
// shift on most targets), which rounds toward negative infinity rather than
// toward zero — confirm that is acceptable for negative values.
Float::operator int() const
{
return m_Value >> PRECISION_BITS;
}
// Binary operators: forward to the shared private helpers. As in the header's
// contract, these mutate lhs in place and return a reference to it.
Float& operator+(Float& lhs, const Float& rhs)
{
    return lhs.add(rhs);
}

Float& operator-(Float& lhs, const Float& rhs)
{
    return lhs.sub(rhs);
}

Float& operator*(Float& lhs, const Float& rhs)
{
    return lhs.mul(rhs);
}

Float& operator/(Float& lhs, const Float& rhs)
{
    return lhs.div(rhs);
}

// Compound assignment: same helpers, applied to *this.
Float& Float::operator+=(const Float& rhs)
{
    return add(rhs);
}

Float& Float::operator-=(const Float& rhs)
{
    return sub(rhs);
}

Float& Float::operator*=(const Float& rhs)
{
    return mul(rhs);
}

Float& Float::operator/=(const Float& rhs)
{
    return div(rhs);
}
// In-place addition: both operands share the same scale, so the raw
// representations add directly.
Float& Float::add(const Float& rhs)
{
    m_Value += rhs.m_Value;
    return *this;
}

// In-place subtraction.
Float& Float::sub(const Float& rhs)
{
    m_Value -= rhs.m_Value;
    return *this;
}

// In-place multiplication. "(f1 * f2)" carries PRECISION_BITS twice, hence
// the need to shift one factor's worth back out.
// NOTE(review): the intermediate product can overflow int for large operands.
Float& Float::mul(const Float& rhs)
{
    m_Value = (m_Value * rhs.m_Value) >> PRECISION_BITS;
    return *this;
}

// In-place division. Pre-scale the dividend so the quotient keeps
// PRECISION_BITS of fraction; avoid "(f1 / f2) << BITS" due to precision
// loss from integer division. Multiplication rather than << because
// left-shifting a negative value is undefined behavior.
Float& Float::div(const Float& rhs)
{
    m_Value = (m_Value * (1 << PRECISION_BITS)) / rhs.m_Value;
    return *this;
}
Anyway, merry christmas y'all. I've been pretty quiet around here lately, but I look forward to changing that. I hope everyone is staying warm and safe.