For quite some while I have been working with beckadamtheinventor and others on a modular exponentation function that I can use in hashlib for RSA. While I am familiar with the general methods that can be used for a powmod function---l2r or r2l binary, the difficulty is in writing one in assembly that is efficient. I tried in C but ran up against not knowing how to do modulus (or bigint stuff in general, effectively), beck succeeded in the pow section but ran into issues with the modulus and bounding, same as Zeda. To this end, I'm making a topic asking for help from anyone who is good at this type of math... I sure am not... to write the following function in assembly. A C variant that can be converted into asm works too. Thanks in advance.


Code:

uint2048_t ^ 65537 % uint2048_t
msg = uint2048_t, variable
exp = constant value 65537
mod = uint2048_t, variable



Code:


void vint_powmod(const uint8_t* msg, uint24_t exp, const uint8_t* mod, uint8_t modlen, uint8_t* out);
// exponent can be hardcoded
// out can be omitted if it would be easier to write to *msg
// modlen = length of key (modulus), in bytes
// length of msg will be implied as equal to that.
// the RSA wrapper will reject any msg not equal in length.
// modulus and msg are in size range min = 1024, max = 2048 bits
Pseudocode:

Code:
C=A^B%M:
  If A>M, subtract M from A
  Set C=1
  For each bit of B, starting from the most significant nonzero bit:
    C=C*C%M (see below; can skip on the first iteration)
    If the current bit of B is a 1:
      C=C*A%M (see below; can do C=A on the first iteration)

C=A^65537%M (special case):
  If A>M, subtract M from A
  C=A
  Repeat 16 times:
    C=C*C%M (see below)
  C=C*A%M (see below)

C=A*B%M:
  If A>M, subtract M from A (can skip if this function is not exported)
  Set C=0
  For each bit of B, starting from the MSB:
    Shift C left by 1 (can skip on the first iteration)
    If C>M, subtract M from C
    If the current bit of B is 1, add A to C
    If C>M, subtract M from C

Translation to assembly should be relatively straightforward. Note that the second function needs C to have 1 more bit than M, which can be just the carry flag if you are careful. There is also quite possibly some better &/or faster way to do it...this is just the second thing that came to mind (the first having been multiplying first & then dividing).

This should not be used in any situation where timing analysis is relevant. Namely, it will leak M if you let someone call it with A values of their choosing, even for fixed nontrivial B.
Here's a first pass, 231 bytes, runs in 1318881369 cycles for 2048 bit and exp of 0x10001:
Code:
#ifndef POWMOD_H
#define POWMOD_H

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/// Computes \c{res = base ** exp % mod}.
/// Assumes \c{base < mod} and msb of \p mod is set.
/// Leaks size and exp but not the contents of exp or mod via timing.
/// @param size size in bytes of res, base, and mod, 0 means 2**8
/// @param res stores the result as little-endian bits within little-endian bytes
/// @param base the base as little-endian bits within little-endian bytes
/// @param exp the exponent
/// @param mod the modulus as little-endian bits within little-endian bytes
void powmod(uint8_t size, uint8_t *res, const uint8_t *base, uint24_t exp, const uint8_t *mod);

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif /* POWMOD_H */

Code:
   public   _powmod
_powmod:
   ld   iy, 0
   add   iy, sp
.size = iy + long
.res = .size + long
.base = .res + long
.exp = .base + long
.mod = .exp + long
   ld   hl, (.base)
   ld   de, (.res)
;   or   a, a
   call   .mulmod
   scf
   sbc   hl, hl
   xor   a, a
   sub   a, (.size)
   ld   l, a
   add   hl, sp
   ld   sp, hl
   ex   de, hl
   ld   hl, (.exp)
;   scf
.normalize:
   adc   hl, hl
   jq   nc, .normalize ; leaks (.exp:long)
   jq   .loop.enter
.loop:
   push   hl, af, de
   ld   bc, (.res)
   or   a, a
   sbc   hl, hl
   add   hl, bc
   push   hl
   scf
   call   .mulmod
   ld   bc, (.base)
   pop   de, hl, af
   push   hl
   call   .mulmod
   pop   de, hl
.loop.enter:
   or   a, a
   adc   hl, hl
   jq   nz, .loop ; leaks (.size:byte)
   ld   sp, iy
   ret
.copy:
   inc   bc
   ldir ; leaks (.size:byte)
   pop   bc
   ret
.mulmod: ; (ude:(.size:byte)) = cf ? (uhl:(.size:byte)) * (ubc:(.size:byte)) % (.mod:(.size:byte)) : (uhl:(.size:byte))
   push   bc
   ld   bc, 0
   ld   c, (.size)
   dec   c
   jq   nc, .copy ; leaks (.exp:long)
   add   hl, bc
   push   hl, de
   inc   de
   scf
   sbc   hl, hl
   add   hl, de
;   ld   b, 0
   ld   (hl), b
   ldir ; leaks (.size:byte)
   pop   de, bc, hl
   ld   a, (.size)
.outer:
   push   af
   ld   a, (bc)
   ld   (.digit), a
   dec   bc
   push   bc, hl, de
   ld   c, a
   ld   b, (hl)
   mlt   bc
   ld   a, b
   or   a, a
   ld   b, (.size)
   jq   .inner.enter
.inner:
   push   bc
   ld   c, 0
.digit := $ - byte
   inc   hl
   ld   b, (hl)
   mlt   bc
   adc   a, c
   push   af
   ld   a, 0
   rla
   add   a, b
   ld   b, a
   pop   af
   ex   de, hl
   add   a, (hl)
   ld   (hl), a
   inc   hl
   ex   de, hl
   ld   a, b
   pop   bc
.inner.enter:
   djnz   .inner ; leaks (.size:byte)
   ex   de, hl
   adc   a, (hl)
   ld   (hl), a
   pop   de, hl
   ld   b, 9
   push   bc
   jq   .reduce.enter
.reduce:
   sla   c
   push   bc, de
   ex   de, hl
   ld   b, (.size)
.shift:
   rl   (hl)
   inc   hl
   djnz   .shift ; leaks (.size:byte)
   ex   de, hl
   pop   de
.reduce.enter:
   push   hl
   sbc   a, a
   cpl
   ld   c, a
   or   a, a
   ld   b, (.size)
   ld   hl, (.mod)
   push   hl, de
.sub:
   ld   a, (de)
   sbc   a, (hl)
   ld   (de), a
   inc   de
   inc   hl
   djnz   .sub ; leaks (.size:byte)
   pop   de, hl
   sbc   a, a
   and   a, c
virtual
   adc   a, (hl)
 load .adc_a__hl_: byte from $$
end virtual
virtual
   ld   c, (hl)
 load .ld_c__hl_: byte from $$
end virtual
   and   a, .adc_a__hl_ xor .ld_c__hl_
   xor   a, .ld_c__hl_
   ld   (.mask), a
;   or   a, a
   ld   b, (.size)
   push   de
.add:
   ld   a, (de)
   adc   a, (hl)
.mask := $ - byte
   ld   (de), a
   inc   de
   inc   hl
   djnz   .add ; leaks (.size:byte)
   pop   de, hl, bc
   djnz   .reduce
   pop   bc, af
   dec   a
   jq   nz, .outer ; leaks (.size:byte)
   ret

nts: 88% of time spent reducing, so alternative representation should be beneficial.
Thanks for this!! Quick question(s) before I can move forward with it:

1. In what endianness is a string of bytes (like for example, if you were to encrypt the string "Hello World?") Would it be big-endian byte order, but little endian bit order?

2. If the answer to #1 is what I think it is, I can document that any public keys (moduli) be sent to the calculator in little endian, but i may also need to write an endian reversing function for hashlib, to deal with the 'msg' portion.
API update for powmod


Code:

/// Computes \c{base = base ** exp % mod}.
/// Assumes that \p base is less than \p mod and than \p mod is odd.
/// Leaks size and exp but not the contents of exp or mod via timing.
/// @param size size in bytes of base, and mod, 0 means 2**8
/// @param base the base stored MSB first, must be less than \p mod
///             the result is also written here
/// @param exp the exponent, must be non-zero
/// @param mod the modulus stored MSB first, must not be all zeros
void powmod(uint8_t size, uint8_t *base, uint24_t exp, const uint8_t *mod);
Here's a second pass, 348 bytes, runs in 280811106 cycles for 2048 bit and exp of 0x10001:
Code:
#ifndef POWMOD_H
#define POWMOD_H

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/// Computes \c{base = base ** exp % mod}.
/// \p base and \p mod may not overlap.
/// Leaks size and exp but not the contents of exp or mod via timing,
/// with the assumption that \p base and \c sp point to normal ram.
/// @param size size in bytes of \p base and \p mod, 0 means 2**8, must not be 1
/// @param base the base stored MSB first, must be less than \p mod
///             the result is also written here MSB first
/// @param exp the exponent, must be non-zero
/// @param mod the modulus stored MSB first, must be odd
void powmod(uint8_t size, uint8_t *restrict base, uint24_t exp, const uint8_t *restrict mod);

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif /* POWMOD_H */

Code:
   public   _powmod
_powmod:
   push   ix
   ld   ix, 0
   lea   bc, ix
   add   ix, sp
.ret  := ix    + long
.size := .ret  + long
.base := .size + long
.exp  := .base + long
.mod  := .exp  + long
.acc  := ix    - long
.tmp  := .acc  - long
.end  := .tmp  - byte
   ld   c, (.size)
   dec   c
   ld   hl, .end - ix
   add   hl, sp
   push   hl
;   scf
   sbc   hl, bc
   push   hl
;   or   a, a
   sbc   hl, bc
   ld   sp, hl
   ld   hl, (.mod)
   add   hl, bc
   ld   (.mod), hl
   ld   b, bsr 8
   ld   e, b
;   ld   e, 1
.nmi.loop:
   ld   a, e
   ld   d, (hl)
   mlt   de
   inc   de
   inc   de
   ld   d, a
   mlt   de
   djnz   .nmi.loop ; leaks size
   ld   a, e
   ld   (.nmi), a
   ld   hl, (.base)
   add   hl, bc
   ld   (.base), hl
   ld   c, 8
.mod.outer:
   ld   b, (.size)
.mod.inner:
   push   bc, hl
   ld   b, (.size)
;   or   a, a
.shift:
   rl   (hl)
   dec   hl
   djnz   .shift ; leaks size
   pop   hl
   push   hl
   call   .reduce
   pop   hl, bc
   djnz   .mod.inner ; leaks size
   dec   c
   jq   nz, .mod.outer ; leaks constant
   ld   c, (.size)
   dec   c
   inc   bc
   ld   de, (.acc)
   lddr ; leaks size
   ld   hl, (.exp)
   scf
.normalize:
   adc   hl, hl
   jq   nc, .normalize ; leaks exp
   xor   a, a
.loop:
   push   hl, af
   ld   hl, (.acc)
   call   nz, .mul ; leaks exp
   pop   af
   ld   hl, (.base)
   call   c, .mul ; leaks exp
   pop   hl
;   or   a, a
   adc   hl, hl
   jq   nz, .loop ; leaks exp
   ld   de, (.tmp)
   add   hl, de
   dec   de
;   ld   bc, 0
   ld   c, (.size)
   dec   c
   ld   (hl), b
   push   hl
   lddr ; leaks size
   pop   hl
   inc   (hl)
   ld   iy, (.base)
   call   .mul.alt
   ld   sp, ix
   pop   ix
   ret
   ; vi(acc) = vi(acc) * vi(hl) % vi(mod)
   ; assumes bc = 0
   ; destroys vi(tmp)
   ; returns bc = 0, cf = 0
.mul:
   ld   iy, (.tmp)
.mul.alt:
   push   hl
   lea   hl, iy - 0
   lea   de, iy - 1
   ld   c, (.size)
   dec   c
   ld   (hl), b
   lddr ; leaks size
   ld   de, (.acc)
   ld   (.acc), iy
   ld   (.tmp), de
   pop   hl
   ld   c, (.size)
   or   a, a
.mul.outer:
   ld   a, (de)
   ld   (.cur), a
   dec   de
   push   de, hl, ix, iy, af
   ld   e, (hl)
   ld   d, a
   mlt   de
   push   hl
   ld   l, (iy)
   ld   h, 0
   add   hl, de
   ld   e, l
   ld   d, 0
.nmi := $ - byte
   mlt   de
   ld   a, e
   ld   (.adj), a
   ld   b, (.size)
   dec   b
   ld   ix, (.mod)
   ld   d, (ix)
   mlt   de
   add.s   hl, de
   ld   e, h
   ld   d, l
;   ld   d, 0
   rl   d
   pop   hl
.mul.inner:
   dec   hl
   push   hl
   ld   l, (hl)
   ld   h, 0
.cur := $ - byte
   mlt   hl
   adc   hl, de
   dec   ix
   ld   e, (ix)
   ld   d, 0
.adj := $ - byte
   mlt   de
   add.s   hl, de
   ld   e, h
   ld   d, 0
   rl   d
   ld   a, l
   dec   iy
   add   a, (iy)
   ld   (iy + 1), a
   pop   hl
   djnz   .mul.inner ; leaks size
   ld   l, b
   rl   l
   ld   h, b
   pop   af
   adc   hl, de
   ld   (iy + 0), l
   sra   h
   pop   iy, ix, hl, de
   dec   c
   jq   nz, .mul.outer ; leaks size
   lea   hl, iy
   ; if (cf:vi(hl) >= vi(mod)) cf:vi(hl) -= vi(mod)
   ; assumes bcu = 0
   ; destroys vi(tmp)
   ; returns bc = 0, cf = 0
.reduce:
   ccf
   sbc   a, a
   ld   c, a
   ld   b, (.size)
   ld   de, (.tmp)
   ld   iy, (.mod)
   or   a, a
   push   hl, de
.reduce.sub:
   ld   a, (hl)
   dec   hl
   sbc   a, (iy)
   dec   iy
   ld   (de), a
   dec   de
   djnz   .reduce.sub ; leaks size
   sbc   a, a
   and   a, c
   and   a, long
   sbc   hl, hl
   ld   l, a
   add   hl, sp
   ld   hl, (hl)
   pop   de, de
   ld   c, (.size)
   dec   c
   inc   bc
   lddr ; leaks size, assuming that base and stack are in normal ram
   ret
Thank you so much, @jacobly, from both myself and beck for helping out with this.
In the words of beck: "jacobly++, you absolute madlad".

At the rate we were struggling with this, without this help^ it would likely have been a long time before this got done. This was the last obstacle to completing hashlib, and now that it's done, hashlib is ready to move into RC releases. I'll update the hashlib thread presently.
  
Register to Join the Conversation
Have your own thoughts to add to this or any other topic? Want to ask a question, offer a suggestion, share your own programs and projects, upload a file to the file archives, get help with calculator and computer programming, or simply chat with like-minded coders and tech and calculator enthusiasts via the site-wide AJAX SAX widget? Registration for a free Cemetech account only takes a minute.

» Go to Registration page
Page 1 of 1
» All times are UTC - 5 Hours
 
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum

 

Advertisement