From c1bcc9e6a6e7a0b48479deeb41bd996aaf91a294 Mon Sep 17 00:00:00 2001
From: czurnieden <czurnieden@gmx.de>
Date: Thu, 24 Dec 2020 01:44:59 +0100
Subject: [PATCH 1/8] Addition of faster to_radix function

---
 demo/test.c            |  33 +++++-
 mp_to_radix.c          |  63 ++++-------
 s_mp_faster_to_radix.c | 235 +++++++++++++++++++++++++++++++++++++++++
 s_mp_radix_map.c       |  16 +++
 s_mp_slower_to_radix.c |  77 ++++++++++++++
 tommath_private.h      |  17 +++
 6 files changed, 392 insertions(+), 49 deletions(-)
 create mode 100644 s_mp_faster_to_radix.c
 create mode 100644 s_mp_slower_to_radix.c

diff --git a/demo/test.c b/demo/test.c
index f86853644..8f2b0381d 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -1155,9 +1155,10 @@ static int test_mp_read_radix(void)
 {
    char buf[4096];
    size_t written;
+   int bignum, i;
 
-   mp_int a;
-   DOR(mp_init_multi(&a, NULL));
+   mp_int a, b;
+   DOR(mp_init_multi(&a, &b, NULL));
 
    DO(mp_read_radix(&a, "123456", 10));
 
@@ -1183,6 +1184,30 @@ static int test_mp_read_radix(void)
    DO(mp_to_radix(&a, buf, sizeof(buf), &written, 10));
    printf("\r '0' a == %s, length = %zu", buf, written);
 
+   /* Test the fast method with a slightly larger number */
+
+   /* Must be bigger than the cut-off value, of course */
+   bignum = 2* (2 * s_mp_radix_exponent_y[2] * MP_RADIX_BARRETT_START_MULTIPLICATOR);
+   printf("Size of bignum_size = %d\n", bignum);
+   /* Check if "bignum" is small enough for the result to fit into "buf"
+      otherwise lead tester to this function */
+   if (bignum >= 4096) {
+      fprintf(stderr, "Buffer too small, please check function \"test_mp_read_radix\" in \"test.c\"");
+      goto LBL_ERR;
+   }
+   /* Produce a random number */
+   bignum /= MP_DIGIT_BIT;
+   DO(mp_rand(&b, bignum));
+   /* Check if it makes the round */
+   printf("Number of limbs in &b = %d, bit_count of &b = %d\n", bignum, mp_count_bits(&b));
+   for (i = 2; i < 65; i++) {
+      DO(mp_to_radix(&b, buf, sizeof(buf), &written, i));
+      DO(mp_read_radix(&a, buf, i));
+      EXPECT(mp_cmp(&a, &b) == MP_EQ);
+      /* fprintf(stderr,"radix = %d\n",i); */
+   }
+
+
    while (0) {
       char *s = fgets(buf, sizeof(buf), stdin);
       if (s != buf) break;
@@ -1192,10 +1217,10 @@ static int test_mp_read_radix(void)
       printf("%s, %lu\n", buf, (unsigned long)a.dp[0] & 3uL);
    }
 
-   mp_clear(&a);
+   mp_clear_multi(&a, &b, NULL);
    return EXIT_SUCCESS;
 LBL_ERR:
-   mp_clear(&a);
+   mp_clear_multi(&a, &b, NULL);
    return EXIT_FAILURE;
 }
 
diff --git a/mp_to_radix.c b/mp_to_radix.c
index 1e5e67110..114902f36 100644
--- a/mp_to_radix.c
+++ b/mp_to_radix.c
@@ -3,17 +3,6 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */
 
-/* reverse an array, used for radix code */
-static void s_reverse(char *s, size_t len)
-{
-   size_t ix = 0, iy = len - 1u;
-   while (ix < iy) {
-      MP_EXCH(char, s[ix], s[iy]);
-      ++ix;
-      --iy;
-   }
-}
-
 /* stores a bignum as a ASCII string in a given radix (2..64)
  *
  * Stores upto "size - 1" chars and always a NULL byte, puts the number of characters
@@ -21,11 +10,9 @@ static void s_reverse(char *s, size_t len)
  */
 mp_err mp_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *written, int radix)
 {
-   size_t  digs;
-   mp_err  err;
-   mp_int  t;
-   mp_digit d;
-   char   *_s = str;
+   mp_err err;
+   mp_int a_bar = *a;
+   size_t part_written = 0;
 
    /* check range of radix and size*/
    if (maxlen < 2u) {
@@ -45,50 +32,36 @@ mp_err mp_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *written, i
       return MP_OKAY;
    }
 
-   if ((err = mp_init_copy(&t, a)) != MP_OKAY) {
-      return err;
-   }
-
    /* if it is negative output a - */
-   if (mp_isneg(&t)) {
-      /* we have to reverse our digits later... but not the - sign!! */
-      ++_s;
-
+   if (mp_isneg(a)) {
       /* store the flag and mark the number as positive */
       *str++ = '-';
-      t.sign = MP_ZPOS;
+      a_bar.sign = MP_ZPOS;
 
       /* subtract a char */
       --maxlen;
    }
-   digs = 0u;
-   while (!mp_iszero(&t)) {
-      if (--maxlen < 1u) {
-         /* no more room */
-         err = MP_BUF;
-         goto LBL_ERR;
-      }
-      if ((err = mp_div_d(&t, (mp_digit)radix, &t, &d)) != MP_OKAY) {
-         goto LBL_ERR;
+
+   /* TODO: check if it can be done better */
+   if (MP_HAS(S_MP_FASTER_TO_RADIX)) {
+      if ((err = s_mp_faster_to_radix(&a_bar, str, maxlen, &part_written, radix)) != MP_OKAY)            goto LBL_ERR;
+   } else {
+      if (MP_HAS(S_MP_SLOWER_TO_RADIX)) {
+         if ((err = s_mp_slower_to_radix(&a_bar, &str, &maxlen, &part_written, radix, false)) != MP_OKAY) goto LBL_ERR;
+         /* part_written does not count EOS */
+         part_written++;
       }
-      *str++ = s_mp_radix_map[d];
-      ++digs;
    }
-   /* reverse the digits of the string.  In this case _s points
-    * to the first digit [excluding the sign] of the number
-    */
-   s_reverse(_s, digs);
 
-   /* append a NULL so the string is properly terminated */
-   *str = '\0';
-   digs++;
+   /* TODO: Think about adding a function for base-2 radices only although
+            s_faster_to_radix is rather quick with such radices. */
 
    if (written != NULL) {
-      *written = mp_isneg(a) ? (digs + 1u): digs;
+      part_written += mp_isneg(a) ? 1: 0;
+      *written = part_written;
    }
 
 LBL_ERR:
-   mp_clear(&t);
    return err;
 }
 
diff --git a/s_mp_faster_to_radix.c b/s_mp_faster_to_radix.c
new file mode 100644
index 000000000..c621bf2e4
--- /dev/null
+++ b/s_mp_faster_to_radix.c
@@ -0,0 +1,235 @@
+#include "tommath_private.h"
+#ifdef S_MP_FASTER_TO_RADIX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+/* Portable integer log of two with small footprint */
+static int32_t s_floor_ilog2(int32_t value)
+{
+   int r = 0;
+   while ((value /= 2) != 0) {
+      r++;
+   }
+   return r;
+}
+
+/* Exponentiation with small footprint */
+static int32_t s_pow(int32_t base, int32_t exponent)
+{
+   int32_t result = 1;
+   while (exponent != 0) {
+      if ((exponent % 2) == 1) {
+         result *= base;
+      }
+      exponent /= 2;
+      base *= base;
+   }
+   return result;
+}
+
+static mp_err s_mp_to_radix_recursive(const mp_int *a, char **str, size_t *part_maxlen, size_t *part_written,
+                                      int radix, int32_t k, int32_t t, bool pad, mp_int *P, mp_int *R)
+{
+
+   mp_int r, q, a1;
+   mp_err err;
+   int Beta;
+
+   if (t < 0) {
+      /* Print the string from the number given*/
+      if ((err = s_mp_slower_to_radix(a, str, part_maxlen, part_written, radix, pad)) != MP_OKAY)        goto LTM_ERR;
+
+   }  else {
+      if ((err = mp_init_multi(&q, &r, &a1, NULL)) != MP_OKAY)                                           goto LTM_ERR;
+      /*
+         Barrett reduction. A step by step proof can be found at
+         https://www.nayuki.io/page/barrett-reduction-algorithm
+
+         See also: Modern Computer Arithmetic, version 0.5.9, page 59
+       */
+
+      /* If this cast-feast looks familiar: it is the numerator from computing the reciprocal*/
+      Beta = (int)((int32_t)((uint32_t)1 << (t+1)) * k);
+
+      /* Q = floor(A1 * I / 2^Beta) */
+      /* I = floor( (2^(2*Beta)) / B) Here we have R[t] = I, P[t] = B */
+      if ((err = mp_mul(a, &R[t], &q)) != MP_OKAY)                                                       goto LTM_ERR;
+      if ((err = mp_div_2d(&q, Beta, &q, NULL)) != MP_OKAY)                                              goto LTM_ERR;
+
+      /* R = A - Q*B */
+      if ((err = mp_mul(&q, &P[t], &r)) != MP_OKAY)                                                      goto LTM_ERR;
+      if ((err = mp_sub(a, &r, &r)) != MP_OKAY)                                                          goto LTM_ERR;
+
+      /* We can use this simple correction because of the way we computed the reciprocal */
+      if (r.sign == MP_NEG) {
+         if ((err = mp_decr(&q)) != MP_OKAY)                                                             goto LTM_ERR;
+         if ((err = mp_add(&r, &P[t], &r)) != MP_OKAY)                                                   goto LTM_ERR;
+      }
+
+      /* Go down the lists while climbing up the tree. */
+      t--;
+
+      /* Follow branches */
+      if (mp_iszero(&q) && (!pad)) {
+         if ((err = s_mp_to_radix_recursive(&r, str, part_maxlen, part_written, radix,
+                                            k, t, false, P, R)) != MP_OKAY)                              goto LTM_ERR;
+      } else {
+         if ((err = s_mp_to_radix_recursive(&q, str, part_maxlen, part_written, radix,
+                                            k, t,  pad, P, R)) != MP_OKAY)                               goto LTM_ERR;
+         if ((err = s_mp_to_radix_recursive(&r, str, part_maxlen, part_written, radix,
+                                            k, t, true, P, R)) != MP_OKAY)                               goto LTM_ERR;
+      }
+      mp_clear_multi(&q, &r, &a1, NULL);
+   }
+
+   err = MP_OKAY;
+LTM_ERR:
+   return err;
+}
+
+
+mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *written, int radix)
+{
+   mp_err err;
+   int32_t n = 0, k, t = 0, steps;
+   int ilog2a;
+
+   /* Use given buffer directly, no temporary buffers for the individual chunks */
+   char **sptr = &str;
+   /* Size of the chunk */
+   size_t part_written = 0;
+   size_t part_maxlen = maxlen;
+
+   /* List of reciprocals */
+   mp_int *R = NULL;
+   /* List of moduli */
+   mp_int *P = NULL;
+
+   /* Denominator for the reciprocal: b^y */
+   n = s_pow((int32_t)radix, (int32_t)s_mp_radix_exponent_y[radix]);
+
+   /* Numerator of the reciprocal: ceil(log_2(n)) */
+   k = s_floor_ilog2(n) + 1;
+
+   /* steps = floor(log_2(floor(log_2(a))))*/
+   ilog2a = mp_count_bits(a) - 1;
+
+   /* Cutoff at about twice the size of P[0]. Interestingly far below Karatsuba cut-off. */
+   if (ilog2a < (2 * k * MP_RADIX_BARRETT_START_MULTIPLICATOR)) {
+      if ((err = s_mp_slower_to_radix(a, sptr, &part_maxlen, &part_written, radix, false)) != MP_OKAY)   goto LTM_ERR;
+      /* part_written does not count EOS */
+      *written = part_written + 1;
+      return err;
+   }
+   /*
+      floor(log_2(floor(log_2(a)))) is not enough but we check for
+      the end inside the loop and the list is just a list of pointers,
+      not much memory wasted here.
+    */
+   steps  = s_floor_ilog2((int32_t)ilog2a) + 2;
+
+   /* Allocate memory for list of reciprocals */
+   R = (mp_int *) MP_MALLOC((size_t) steps * sizeof(mp_int));
+   if (R == NULL) {
+      return MP_MEM;
+   }
+   /* Allocate memory for list of moduli */
+   P = (mp_int *) MP_MALLOC((size_t) steps * sizeof(mp_int));
+   if (P == NULL) {
+      MP_FREE_BUF(R, (size_t) steps * sizeof(mp_int));
+      return MP_MEM;
+   }
+
+   /*
+      The approximation to the reciprocal used in Barrett's method is
+          R_t = ceil(2^((2^t)*k)/n^(2^t))
+      with R_0 = (2^(2*k))/b^y and k = ceil(log_2(n)) as computed above.
+    */
+
+   /* To get the tree a bit flatter. Alternative: do it iteratively instead of recursively */
+   k = k * MP_RADIX_BARRETT_START_MULTIPLICATOR;
+
+   /* Compute initial reciprocal R[0] and expand it (R[0]^(2^k) */
+   if ((err = mp_init_i32(&P[0], n)) != MP_OKAY)                                                         goto LTM_ERR;
+   if ((err = mp_expt_n(&P[0], MP_RADIX_BARRETT_START_MULTIPLICATOR, &P[0])) != MP_OKAY)                 goto LTM_ERR;
+
+   if ((err = mp_init(&R[0])) != MP_OKAY)                                                                goto LTM_ERR;
+   if ((err = mp_2expt(&R[0], 2*k)) != MP_OKAY)                                                          goto LTM_ERR;
+
+   if ((err = mp_div(&R[0], &P[0], &R[0], NULL)) != MP_OKAY)                                             goto LTM_ERR;
+   if ((err = mp_incr(&R[0])) != MP_OKAY)                                                                goto LTM_ERR;
+
+   /* Compute the rest of the reciprocals if as needed */
+   for (t = 1; t < steps; t++) {
+      /* P_t = (b^y)^(2^t) = n^(2^t) */
+      /*
+         We cannot just square because it can
+            a) overflow MP_MAX_DIGIT_COUNT
+            b) it can get bigger than "a" which it shouldn't
+               which also means that
+            c) if it gets bigger than "a" we have all necessary
+               reciprocals and can break out of the loop
+      */
+      /* Check for overflow of 2^((2^t)*k) i.e. bigger than 2^MP_MAX_DIGIT_COUNT */
+      if (((int)(1u << t)*k) > MP_MAX_DIGIT_COUNT) {
+         /* TODO: This can only happen near MP_MAX_DIGIT_COUNT and we can use
+                  the reciprocal R[t-1] to do the division but R[t] != R[t-1]^2
+                  so we cannot just divide by R[t-1] twice.
+          */
+         err = MP_OVF;
+         goto LTM_ERR;
+      }
+
+      /* P[t-1]^2 > a at most likely more than just a bit or too, so check if we
+         can bail out early without actually computing the square. The
+         constant "10" is comprised of unity plus some angst-allowance */
+      if ((2 * mp_count_bits(&P[t-1]) - 10) > ilog2a) {
+         /* Correct index */
+         t--;
+         break;
+      }
+
+      /* Compute denominator */
+      if ((err = mp_init(&P[t])) != MP_OKAY)                                                             goto LTM_ERR;
+      /* P[t] = P[t-1]^2 */
+      if ((err = mp_sqr(&P[t-1], &P[t])) != MP_OKAY)                                                     goto LTM_ERR;
+      /* Check if P[t]^2 > a */
+      if (mp_cmp(&P[t],a) == MP_GT) {
+         /* We don't need P[t] anymore */
+         mp_clear(&P[t]);
+         /* Correct index */
+         t--;
+         break;
+      }
+      /* Compute numerator */
+      if ((err = mp_init(&R[t])) != MP_OKAY)                                                             goto LTM_ERR;
+
+      /* R[t] = R[t] << (2^t * k) The factor cannot overflow, we checked that above */
+      /* TODO: these are more castings than in the ER in Mayrhofen at New Year's Eve! */
+      if ((err = mp_2expt(&(R[t]), (int)((int32_t)((uint32_t)1 << (t+1)) * k))) != MP_OKAY)              goto LTM_ERR;
+
+      /* Compute reciprocal */
+      /* R[t] = floor(2^(2^t * k) / P[t] */
+      if ((err = mp_div(&R[t], &P[t], &R[t], NULL)) != MP_OKAY)                                          goto LTM_ERR;
+      /* Ceiling if P[t] is not a power of two but it is not a problem if P[t] is a power of two. */
+      if ((err = mp_incr(&R[t])) != MP_OKAY)                                                             goto LTM_ERR;
+   }
+
+   /* And finally: start the recursion. */
+   if ((err = s_mp_to_radix_recursive(a, sptr, &part_maxlen, &part_written, radix,
+                                      k, t, false, P, R)) != MP_OKAY)                                    goto LTM_ERR;
+   /* part_written does not account for EOS */
+   *written = part_written + 1;
+
+   err = MP_OKAY;
+LTM_ERR:
+   do {
+      mp_clear(&P[t]);
+      mp_clear(&R[t]);
+   } while (t--);
+   MP_FREE_BUF(P, (size_t) steps * sizeof(mp_int));
+   MP_FREE_BUF(R, (size_t) steps * sizeof(mp_int));
+   return err;
+}
+
+#endif
diff --git a/s_mp_radix_map.c b/s_mp_radix_map.c
index 68e21f32e..950a698fe 100644
--- a/s_mp_radix_map.c
+++ b/s_mp_radix_map.c
@@ -15,5 +15,21 @@ const uint8_t s_mp_radix_map_reverse[] = {
    0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, /* ghijklmnop */
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d  /* qrstuvwxyz */
 };
+
 MP_STATIC_ASSERT(correct_radix_map_reverse_size, sizeof(s_mp_radix_map_reverse) == MP_RADIX_MAP_REVERSE_SIZE)
+
+/* Exponents chosen such that b^(y) <= 2^20 */
+const uint8_t s_mp_radix_exponent_y[] = {  0, 0,                      /*  0 .. 1*/
+                                           20, 12, 10, 8, 7, 7, 6, 6, /*  2 .. 9 */
+                                           6, 5, 5, 5, 5, 5, 5, 4,    /* 10 .. 17 */
+                                           4, 4, 4, 4, 4, 4, 4, 4,    /* 18 .. 25 */
+                                           4, 4, 4, 4, 4, 4, 4, 3,    /* 26 .. 33 */
+                                           3, 3, 3, 3, 3, 3, 3, 3,    /* 34 .. 41 */
+                                           3, 3, 3, 3, 3, 3, 3, 3,    /* 42 .. 49 */
+                                           3, 3, 3, 3, 3, 3, 3, 3,    /* 51 .. 57 */
+                                           3, 3, 3, 3, 3, 3, 3        /* 58 .. 64 */
+                                        };
+
+MP_STATIC_ASSERT(correct_mp_radix_exponent_y, sizeof(s_mp_radix_exponent_y) == MP_RADIX_EXPONENT_Y_SIZE)
+
 #endif
diff --git a/s_mp_slower_to_radix.c b/s_mp_slower_to_radix.c
new file mode 100644
index 000000000..f0f5caf52
--- /dev/null
+++ b/s_mp_slower_to_radix.c
@@ -0,0 +1,77 @@
+#include "tommath_private.h"
+#ifdef S_MP_SLOWER_TO_RADIX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+static void s_reverse(char *s, size_t len)
+{
+   size_t ix = 0, iy = len - 1u;
+   while (ix < iy) {
+      MP_EXCH(char, s[ix], s[iy]);
+      ++ix;
+      --iy;
+   }
+}
+
+mp_err s_mp_slower_to_radix(const mp_int *a, char **str,
+                            size_t *part_maxlen, size_t *part_written, int radix, bool pad)
+{
+   size_t digs = 0u;
+   mp_int t;
+   mp_digit d;
+   mp_err err = MP_OKAY;
+
+   /* The number of digits of "radix" to be filled if this chunk is not the most significant one. */
+   int ybar = s_mp_radix_exponent_y[radix] * MP_RADIX_BARRETT_START_MULTIPLICATOR;
+
+   /* A temporary pointer to the output string to make reversal simpler */
+   char *s = *str;
+
+   /* TODO: input a is already a copy of the original and we could use it destructively? */
+   if ((err = mp_init_copy(&t, a)) != MP_OKAY)                                                           goto LTM_ERR;
+
+   while (!mp_iszero(&t)) {
+      /* TODO: this method to decrease "maxlen" is not threadsafe! */
+      if ((--(*part_maxlen)) < 1u) {
+         /* no more room */
+         err = MP_BUF;
+         goto LTM_ERR;
+      }
+      if ((err = mp_div_d(&t, (mp_digit)radix, &t, &d)) != MP_OKAY) {
+         goto LTM_ERR;
+      }
+      *s++ = s_mp_radix_map[d];
+      ++digs;
+      ybar--;
+   }
+
+   /* Fill in leading zeros if this chunk contains the most significant digits. */
+   if (pad) {
+      while ((ybar-- > 0) && (((*part_maxlen)--) > 0)) {
+         *s++ = '0';
+         digs++;
+      }
+   }
+
+   /* TODO: I think that can be done more elegantly */
+   /* "rewind" */
+   s = *str;
+   /* reverse */
+   s_reverse(s, digs);
+   /* step forward */
+   *str += digs;
+   /* Add EOS at teh end of every chunk to allow this function to be used stand-alone */
+   **str = '\0';
+
+   /* TODO: this method to increase "written" is not threadsafe! */
+   if (part_written != NULL) {
+      *part_written = *part_written + digs;
+   }
+
+   err = MP_OKAY;
+LTM_ERR:
+   mp_clear(&t);
+   return err;
+}
+
+#endif
diff --git a/tommath_private.h b/tommath_private.h
index d319a1db0..b55392c49 100644
--- a/tommath_private.h
+++ b/tommath_private.h
@@ -229,10 +229,27 @@ MP_PRIVATE mp_err s_mp_radix_size_overestimate(const mp_int *a, const int radix,
 MP_PRIVATE mp_err s_mp_fp_log(const mp_int *a, mp_int *c) MP_WUR;
 MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR;
 
+/* Radix conversion O(M(n) log n) Schoenhage method */
+MP_PRIVATE mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *written, int radix) MP_WUR;
+/* Radix conversion O(n^2) method */
+MP_PRIVATE mp_err s_mp_slower_to_radix(const mp_int *a, char **str, size_t *part_maxlen, size_t *part_written,
+                                       int radix, bool pad) MP_WUR;
+
 #define MP_RADIX_MAP_REVERSE_SIZE 80u
+#define MP_RADIX_EXPONENT_Y_SIZE  65u
 extern MP_PRIVATE const char s_mp_radix_map[];
 extern MP_PRIVATE const uint8_t s_mp_radix_map_reverse[];
 extern MP_PRIVATE const mp_digit s_mp_prime_tab[];
+extern MP_PRIVATE const uint8_t s_mp_radix_exponent_y[];
+
+/*
+  There is not much to tune here, the steps are of the form 2^k and too large
+  for tuning to make a alot of sense.
+ */
+#ifndef MP_RADIX_BARRETT_START_MULTIPLICATOR
+#   define MP_RADIX_BARRETT_START_MULTIPLICATOR   8
+#endif
+
 
 /* number of primes */
 #define MP_PRIME_TAB_SIZE 256

From 6c9fa6b0af90c899d4a086da43bc39b6729601b8 Mon Sep 17 00:00:00 2001
From: czurnieden <czurnieden@gmx.de>
Date: Thu, 24 Dec 2020 04:09:09 +0100
Subject: [PATCH 2/8] Addition of faster read_radix method

---
 demo/test.c              |  85 +++++++++++++++++++++++------
 mp_read_radix.c          |  58 ++++++++------------
 s_mp_faster_read_radix.c |  69 +++++++++++++++++++++++
 s_mp_faster_to_radix.c   | 115 ++++++++++++++++++++++++---------------
 s_mp_slower_read_radix.c |  39 +++++++++++++
 s_mp_slower_to_radix.c   |   4 +-
 tommath_private.h        |  10 +++-
 7 files changed, 279 insertions(+), 101 deletions(-)
 create mode 100644 s_mp_faster_read_radix.c
 create mode 100644 s_mp_slower_read_radix.c

diff --git a/demo/test.c b/demo/test.c
index 8f2b0381d..c48d5d80f 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -1151,12 +1151,17 @@ static int test_mp_montgomery_reduce(void)
 
 }
 
+#include <time.h>
 static int test_mp_read_radix(void)
 {
    char buf[4096];
-   size_t written;
+   size_t written, maxlen;
    int bignum, i;
 
+   char *buffer, *bcpy;
+
+   clock_t start, stop, t_slow, t_fast;
+
    mp_int a, b;
    DOR(mp_init_multi(&a, &b, NULL));
 
@@ -1187,25 +1192,73 @@ static int test_mp_read_radix(void)
    /* Test the fast method with a slightly larger number */
 
    /* Must be bigger than the cut-off value, of course */
-   bignum = 2* (2 * s_mp_radix_exponent_y[2] * MP_RADIX_BARRETT_START_MULTIPLICATOR);
-   printf("Size of bignum_size = %d\n", bignum);
-   /* Check if "bignum" is small enough for the result to fit into "buf"
-      otherwise lead tester to this function */
-   if (bignum >= 4096) {
-      fprintf(stderr, "Buffer too small, please check function \"test_mp_read_radix\" in \"test.c\"");
+   bignum = (2 * 20 * MP_RADIX_BARRETT_START_MULTIPLICATOR)  * 10;
+   buffer = (char *)malloc((size_t)(bignum + 2));
+   if (buffer == NULL) {
       goto LBL_ERR;
    }
-   /* Produce a random number */
-   bignum /= MP_DIGIT_BIT;
-   DO(mp_rand(&b, bignum));
-   /* Check if it makes the round */
-   printf("Number of limbs in &b = %d, bit_count of &b = %d\n", bignum, mp_count_bits(&b));
+   DO(mp_rand(&a, bignum / MP_DIGIT_BIT));
+   printf("\nNumber of limbs in &b = %d, bit_count of &b = %d\n", bignum / MP_DIGIT_BIT, mp_count_bits(&a));
+   start = clock();
+   for (i = 2; i < 65; i++) {
+      /* printf("FAST radix = %d\n",i); */
+      DO(mp_to_radix(&a, buffer, (size_t)(bignum + 1), &written, i));
+      DO(mp_read_radix(&b, buffer, i));
+      EXPECT(mp_cmp(&a, &b) == MP_EQ);
+   }
+   stop = clock();
+   t_fast = stop - start;
+
+   printf("Same number, slow radix conversions\n");
+   start = clock();
    for (i = 2; i < 65; i++) {
-      DO(mp_to_radix(&b, buf, sizeof(buf), &written, i));
-      DO(mp_read_radix(&a, buf, i));
+      /* printf("SLOW radix = %d\n",i); */
+      maxlen = (size_t)(bignum + 1);
+      bcpy = buffer;
+      DO(s_mp_slower_to_radix(&a, &bcpy, &maxlen, &written, i, false));
+      DO(s_mp_slower_read_radix(&b, bcpy, 0, strlen(bcpy), i));
       EXPECT(mp_cmp(&a, &b) == MP_EQ);
-      /* fprintf(stderr,"radix = %d\n",i); */
    }
+   stop = clock();
+   t_slow = stop - start;
+
+   /* It is "long int" in GLibC but can be bigger and/or even a floating point elsewhere */
+   printf("SLOW: %.10f, FAST: %.10f\n", (double)t_slow/(double)CLOCKS_PER_SEC, (double)t_fast/(double)CLOCKS_PER_SEC);
+
+   /* Check if the branching works. */
+   if (MP_HAS(S_MP_FASTER_READ_RADIX) && MP_HAS(S_MP_FASTER_TO_RADIX)) {
+      if (t_fast > t_slow) {
+         fprintf(stderr, "Timing suspicious in test_mp_read_radix. No fast multiplication? Cut-off too low?\n");
+         goto LBL_ERR;
+      }
+   }
+
+
+   free(buffer);
+
+#if ((MP_DIGIT_BIT <= 16) && (defined MP_CHECK_RADIX_OVF))
+   /* Check a number of size (MP_MAX_DIGIT_COUNT * MP_DIGIT_BIT - 1) at fixed radix "10". */
+   /* Will not work if test is run on platforms with larger int's because
+         #define MP_MAX_DIGIT_COUNT ((INT_MAX - 2) / MP_DIGIT_BIT)
+      So we have to replace the value for INT_MAX with 2^15 - 1 = 32767 to test 16-bit int's. Not
+      very elegant but it works.
+   */
+   bignum = ((32767 - 2) / MP_DIGIT_BIT);
+   bignum = ((bignum - 1) * MP_DIGIT_BIT) + (MP_DIGIT_BIT - 1);
+   /* Manual computation because the automatic methods might not have been included in the build */
+   buffer = (char *)malloc(((bignum + 2)/1000) * 333);
+   if (buffer == NULL) {
+      goto LBL_ERR;
+   }
+   DO(mp_2expt(&a, bignum));
+   DO(mp_decr(&a));
+   printf("Number of limbs in &b = %d, bit_count of &b = %d\n", bignum / MP_DIGIT_BIT, mp_count_bits(&a));
+   DO(mp_to_radix(&a, buffer, ((bignum + 2)/1000) * 333, &written, 10));
+   DO(mp_read_radix(&b, buffer, 10));
+   EXPECT(mp_cmp(&a, &b) == MP_EQ);
+   free(buffer);
+#endif
+
 
 
    while (0) {
@@ -2504,7 +2557,7 @@ static int unit_tests(int argc, char **argv)
       T1(mp_prime_next_prime, MP_PRIME_NEXT_PRIME),
       T1(mp_prime_rand, MP_PRIME_RAND),
       T1(mp_rand, MP_RAND),
-      T1(mp_read_radix, MP_READ_RADIX),
+      T3(mp_read_radix, ONLY_PUBLIC_API, MP_READ_RADIX, MP_TO_RADIX),
       T1(mp_read_write_ubin, MP_TO_UBIN),
       T1(mp_read_write_sbin, MP_TO_SBIN),
       T1(mp_reduce_2k, MP_REDUCE_2K),
diff --git a/mp_read_radix.c b/mp_read_radix.c
index 28e6eb60a..f02d8c592 100644
--- a/mp_read_radix.c
+++ b/mp_read_radix.c
@@ -3,11 +3,23 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */
 
+static size_t s_mp_strlen(const char *s)
+{
+   const char *p;
+   p = s;
+   while (*p != '\0') {
+      p++;
+   }
+   return (size_t)(p - s);
+}
+
 /* read a string [ASCII] in a given radix */
 mp_err mp_read_radix(mp_int *a, const char *str, int radix)
 {
+
    mp_err   err;
    mp_sign  sign = MP_ZPOS;
+   size_t slen;
 
    /* make sure the radix is ok */
    if ((radix < 2) || (radix > 64)) {
@@ -22,48 +34,24 @@ mp_err mp_read_radix(mp_int *a, const char *str, int radix)
       sign = MP_NEG;
    }
 
-   /* set the integer to the default of zero */
-   mp_zero(a);
-
-   /* process each digit of the string */
-   while (*str != '\0') {
-      /* if the radix <= 36 the conversion is case insensitive
-       * this allows numbers like 1AB and 1ab to represent the same  value
-       * [e.g. in hex]
-       */
-      uint8_t y;
-      char ch = (radix <= 36) ? (char)MP_TOUPPER((int)*str) : *str;
-      unsigned pos = (unsigned)(ch - '+');
-      if (MP_RADIX_MAP_REVERSE_SIZE <= pos) {
-         break;
-      }
-      y = s_mp_radix_map_reverse[pos];
+   slen = s_mp_strlen(str);
 
-      /* if the char was found in the map
-       * and is less than the given radix add it
-       * to the number, otherwise exit the loop.
-       */
-      if (y >= radix) {
-         break;
-      }
-      if ((err = mp_mul_d(a, (mp_digit)radix, a)) != MP_OKAY) {
-         return err;
-      }
-      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
-         return err;
-      }
-      ++str;
-   }
+   mp_zero(a);
 
-   /* if an illegal character was found, fail. */
-   if ((*str != '\0') && (*str != '\r') && (*str != '\n')) {
-      return MP_VAL;
+   /* Try faster version first */
+   if (MP_HAS(S_MP_FASTER_READ_RADIX)) {
+      if ((err = s_mp_faster_read_radix(a, str, 0, slen, radix)) != MP_OKAY)                             goto LTM_ERR;
+   } else if (MP_HAS(S_MP_SLOWER_READ_RADIX)) {
+      if ((err = s_mp_slower_read_radix(a, str, 0, slen, radix)) != MP_OKAY)                             goto LTM_ERR;
    }
 
    /* set the sign only if a != 0 */
    if (!mp_iszero(a)) {
       a->sign = sign;
    }
-   return MP_OKAY;
+
+LTM_ERR:
+   return err;
 }
+
 #endif
diff --git a/s_mp_faster_read_radix.c b/s_mp_faster_read_radix.c
new file mode 100644
index 000000000..76c0441b9
--- /dev/null
+++ b/s_mp_faster_read_radix.c
@@ -0,0 +1,69 @@
+#include "tommath_private.h"
+#ifdef S_MP_FASTER_READ_RADIX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+/* TODO: It is tunable */
+#define MP_READ_RADIX_CUTOFF_MULTIPLICATOR 3
+
+/* for(n=2,64,t = ceil(log(2^100)/log(n)); printf(t", "); ) */
+static const uint8_t s_read_radix_cutoff[65] = { 0, 0,                            /*  0 .. 1*/
+                                                 100, 64, 50, 44, 39, 36, 34, 32, /*  2 .. 9 */
+                                                 31, 29, 28, 28, 27, 26, 25, 25,  /* 10 .. 17 */
+                                                 24, 24, 24, 23, 23, 23, 22, 22,  /* 18 .. 25 */
+                                                 22, 22, 21, 21, 21, 21, 20, 20,  /* 26 .. 33 */
+                                                 20, 20, 20, 20, 20, 19, 19, 19,  /* 34 .. 41 */
+                                                 19, 19, 19, 19, 19, 19, 18, 18,  /* 42 .. 49 */
+                                                 18, 18, 18, 18, 18, 18, 18, 18,  /* 51 .. 57 */
+                                                 18, 17, 17, 17, 17, 17, 17       /* 58 .. 64 */
+                                               };
+
+/* This is in mp_prime_is_prime.c and can be reused */
+static int s_floor_ilog2(int value)
+{
+   int r = 0;
+   while ((value >>= 1) != 0) {
+      r++;
+   }
+   return r;
+}
+
+mp_err s_mp_faster_read_radix(mp_int *a, const char *str, size_t start, size_t end, int radix)
+{
+   size_t len, mid;
+   mp_int A, B, m;
+   mp_err err = MP_OKAY;
+
+   len = end - start;
+
+   if (len < (size_t)(s_read_radix_cutoff[radix] * MP_READ_RADIX_CUTOFF_MULTIPLICATOR)) {
+      return s_mp_slower_read_radix(a, str, start, end, radix);
+   }
+
+   mid = len / 2u;
+
+   if ((err = mp_init_set(&m, (mp_digit)radix)) != MP_OKAY) {
+      return err;
+   }
+   if ((err = mp_init_multi(&A, &B, NULL)) != MP_OKAY) {
+      mp_clear(&m);
+      return err;
+   }
+
+   if ((err = s_mp_slower_read_radix(&A, str, start, start + mid + 1, radix)) != MP_OKAY)                goto LTM_ERR;
+   if ((err = s_mp_slower_read_radix(&B, str, start + mid +1, end, radix)) != MP_OKAY)                   goto LTM_ERR;
+
+   if (MP_IS_2EXPT((unsigned int)radix)) {
+      if ((err = mp_mul_2d(&A, (int)(((len - mid) - 1u) * (size_t)s_floor_ilog2(radix)), &A)) != MP_OKAY)goto LTM_ERR;
+   } else {
+      if ((err = mp_expt_n(&m, (int)((len - mid) - 1u), &m)) != MP_OKAY)                                 goto LTM_ERR;
+      if ((err = mp_mul(&A, &m, &A)) != MP_OKAY)                                                         goto LTM_ERR;
+   }
+   if ((err = mp_add(&A, &B, a)) != MP_OKAY)                                                             goto LTM_ERR;
+
+LTM_ERR:
+   mp_clear_multi(&A, &B, &m, NULL);
+   return err;
+}
+
+#endif
diff --git a/s_mp_faster_to_radix.c b/s_mp_faster_to_radix.c
index c621bf2e4..e45a059dd 100644
--- a/s_mp_faster_to_radix.c
+++ b/s_mp_faster_to_radix.c
@@ -14,6 +14,7 @@ static int32_t s_floor_ilog2(int32_t value)
 }
 
 /* Exponentiation with small footprint */
+
 static int32_t s_pow(int32_t base, int32_t exponent)
 {
    int32_t result = 1;
@@ -22,13 +23,18 @@ static int32_t s_pow(int32_t base, int32_t exponent)
          result *= base;
       }
       exponent /= 2;
+      if (exponent == 0) {
+         break;
+      }
       base *= base;
    }
    return result;
 }
 
+#define MP_COMPUTE_ESS(T) ((int)((int32_t)((uint32_t)1 << (T)) * k))
+
 static mp_err s_mp_to_radix_recursive(const mp_int *a, char **str, size_t *part_maxlen, size_t *part_written,
-                                      int radix, int32_t k, int32_t t, bool pad, mp_int *P, mp_int *R)
+                                      int radix, int32_t k, int32_t t, bool pad, bool first, mp_int *P, mp_int *R)
 {
 
    mp_int r, q, a1;
@@ -41,43 +47,45 @@ static mp_err s_mp_to_radix_recursive(const mp_int *a, char **str, size_t *part_
 
    }  else {
       if ((err = mp_init_multi(&q, &r, &a1, NULL)) != MP_OKAY)                                           goto LTM_ERR;
-      /*
-         Barrett reduction. A step by step proof can be found at
-         https://www.nayuki.io/page/barrett-reduction-algorithm
+      if (first) {
+         if ((err = mp_div(a, &P[t], &q, &r)) != MP_OKAY)                                                  goto LTM_ERR;
+      } else {
+         /*
+            Barrett reduction. A step by step proof can be found at
+            https://www.nayuki.io/page/barrett-reduction-algorithm
 
-         See also: Modern Computer Arithmetic, version 0.5.9, page 59
-       */
+            See also: Modern Computer Arithmetic, version 0.5.9, page 59
+          */
 
-      /* If this cast-feast looks familiar: it is the numerator from computing the reciprocal*/
-      Beta = (int)((int32_t)((uint32_t)1 << (t+1)) * k);
+         Beta = MP_COMPUTE_ESS(t+1);
 
-      /* Q = floor(A1 * I / 2^Beta) */
-      /* I = floor( (2^(2*Beta)) / B) Here we have R[t] = I, P[t] = B */
-      if ((err = mp_mul(a, &R[t], &q)) != MP_OKAY)                                                       goto LTM_ERR;
-      if ((err = mp_div_2d(&q, Beta, &q, NULL)) != MP_OKAY)                                              goto LTM_ERR;
+         /* Q = floor(A1 * I / 2^Beta) */
+         /* I = floor( (2^(2*Beta)) / B) Here we have R[t] = I, P[t] = B */
+         if ((err = mp_mul(a, &R[t], &q)) != MP_OKAY)                                                    goto LTM_ERR;
+         if ((err = mp_div_2d(&q, Beta, &q, NULL)) != MP_OKAY)                                           goto LTM_ERR;
 
-      /* R = A - Q*B */
-      if ((err = mp_mul(&q, &P[t], &r)) != MP_OKAY)                                                      goto LTM_ERR;
-      if ((err = mp_sub(a, &r, &r)) != MP_OKAY)                                                          goto LTM_ERR;
+         /* R = A - Q*B */
+         if ((err = mp_mul(&q, &P[t], &r)) != MP_OKAY)                                                   goto LTM_ERR;
+         if ((err = mp_sub(a, &r, &r)) != MP_OKAY)                                                       goto LTM_ERR;
 
-      /* We can use this simple correction because of the way we computed the reciprocal */
-      if (r.sign == MP_NEG) {
-         if ((err = mp_decr(&q)) != MP_OKAY)                                                             goto LTM_ERR;
-         if ((err = mp_add(&r, &P[t], &r)) != MP_OKAY)                                                   goto LTM_ERR;
+         /* We can use this simple correction because of the way we computed the reciprocal */
+         if (r.sign == MP_NEG) {
+            if ((err = mp_decr(&q)) != MP_OKAY)                                                          goto LTM_ERR;
+            if ((err = mp_add(&r, &P[t], &r)) != MP_OKAY)                                                goto LTM_ERR;
+         }
       }
-
       /* Go down the lists while climbing up the tree. */
       t--;
 
       /* Follow branches */
       if (mp_iszero(&q) && (!pad)) {
          if ((err = s_mp_to_radix_recursive(&r, str, part_maxlen, part_written, radix,
-                                            k, t, false, P, R)) != MP_OKAY)                              goto LTM_ERR;
+                                            k, t, false, false, P, R)) != MP_OKAY)                       goto LTM_ERR;
       } else {
          if ((err = s_mp_to_radix_recursive(&q, str, part_maxlen, part_written, radix,
-                                            k, t,  pad, P, R)) != MP_OKAY)                               goto LTM_ERR;
+                                            k, t,  pad, false, P, R)) != MP_OKAY)                        goto LTM_ERR;
          if ((err = s_mp_to_radix_recursive(&r, str, part_maxlen, part_written, radix,
-                                            k, t, true, P, R)) != MP_OKAY)                               goto LTM_ERR;
+                                            k, t, true, false, P, R)) != MP_OKAY)                        goto LTM_ERR;
       }
       mp_clear_multi(&q, &r, &a1, NULL);
    }
@@ -87,11 +95,10 @@ static mp_err s_mp_to_radix_recursive(const mp_int *a, char **str, size_t *part_
    return err;
 }
 
-
 mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *written, int radix)
 {
    mp_err err;
-   int32_t n = 0, k, t = 0, steps;
+   int32_t n = 0, k, t = 0, steps = 0;
    int ilog2a;
 
    /* Use given buffer directly, no temporary buffers for the individual chunks */
@@ -100,6 +107,8 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
    size_t part_written = 0;
    size_t part_maxlen = maxlen;
 
+   bool num_ovf = false;
+
    /* List of reciprocals */
    mp_int *R = NULL;
    /* List of moduli */
@@ -114,7 +123,7 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
    /* steps = floor(log_2(floor(log_2(a))))*/
    ilog2a = mp_count_bits(a) - 1;
 
-   /* Cutoff at about twice the size of P[0]. Interestingly far below Karatsuba cut-off. */
+   /* Cutoff at about twice the size of P[0]. */
    if (ilog2a < (2 * k * MP_RADIX_BARRETT_START_MULTIPLICATOR)) {
       if ((err = s_mp_slower_to_radix(a, sptr, &part_maxlen, &part_written, radix, false)) != MP_OKAY)   goto LTM_ERR;
       /* part_written does not count EOS */
@@ -159,6 +168,7 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
    if ((err = mp_div(&R[0], &P[0], &R[0], NULL)) != MP_OKAY)                                             goto LTM_ERR;
    if ((err = mp_incr(&R[0])) != MP_OKAY)                                                                goto LTM_ERR;
 
+
    /* Compute the rest of the reciprocals if as needed */
    for (t = 1; t < steps; t++) {
       /* P_t = (b^y)^(2^t) = n^(2^t) */
@@ -175,15 +185,29 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
          /* TODO: This can only happen near MP_MAX_DIGIT_COUNT and we can use
                   the reciprocal R[t-1] to do the division but R[t] != R[t-1]^2
                   so we cannot just divide by R[t-1] twice.
+
+                  But as it is the root of the tree it is used only once and caching
+                  makes no sense in the first place, we can divide a/P[last] directly
+
+                  This is always the case for the first division and we can do it
+                  in general to save about half of the cache memory and a bit of
+                  computation time by avoiding the overhead of the Barrett division.
+
+                  We can set a flag (MP_OVF is an error and it might be frowned upon
+                  using it as a flag) or R[last] to zero (minus one) or just start
+                  with a plain division every time as described above.
+
+                  Problem with the "always dividing directly" is that it is not known
+                  for sure if P[t-1]^2 > a without actualy computing P[t-1]^2 but it
+                  is a rare event that the heuristic check below fails, so the cost is
+                  not as high as it seems.
           */
-         err = MP_OVF;
-         goto LTM_ERR;
+         num_ovf = true;
       }
 
-      /* P[t-1]^2 > a at most likely more than just a bit or too, so check if we
-         can bail out early without actually computing the square. The
-         constant "10" is comprised of unity plus some angst-allowance */
-      if ((2 * mp_count_bits(&P[t-1]) - 10) > ilog2a) {
+      /* P[t-1]^2 > a is most likely more than just a bit or too, so check if we
+         can bail out early without actually computing the square. */
+      if ((2 * mp_count_bits(&P[t-1]) - 4) > ilog2a) {
          /* Correct index */
          t--;
          break;
@@ -201,23 +225,24 @@ mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *w
          t--;
          break;
       }
-      /* Compute numerator */
-      if ((err = mp_init(&R[t])) != MP_OKAY)                                                             goto LTM_ERR;
-
-      /* R[t] = R[t] << (2^t * k) The factor cannot overflow, we checked that above */
-      /* TODO: these are more castings than in the ER in Mayrhofen at New Year's Eve! */
-      if ((err = mp_2expt(&(R[t]), (int)((int32_t)((uint32_t)1 << (t+1)) * k))) != MP_OKAY)              goto LTM_ERR;
-
-      /* Compute reciprocal */
-      /* R[t] = floor(2^(2^t * k) / P[t] */
-      if ((err = mp_div(&R[t], &P[t], &R[t], NULL)) != MP_OKAY)                                          goto LTM_ERR;
-      /* Ceiling if P[t] is not a power of two but it is not a problem if P[t] is a power of two. */
-      if ((err = mp_incr(&R[t])) != MP_OKAY)                                                             goto LTM_ERR;
+
+      /* We cannot evaluate the numerator if the computation would overflow */
+      if (!num_ovf) {
+         /* Compute numerator */
+         if ((err = mp_init(&R[t])) != MP_OKAY)                                                          goto LTM_ERR;
+         /* R[t] = R[t] << (2^t * k) The factor cannot overflow, we checked that above */
+         if ((err = mp_2expt(&(R[t]), MP_COMPUTE_ESS(t + 1))) != MP_OKAY)                                goto LTM_ERR;
+         /* Compute reciprocal */
+         /* R[t] = floor(2^(2^t * k) / P[t] */
+         if ((err = mp_div(&R[t], &P[t], &R[t], NULL)) != MP_OKAY)                                       goto LTM_ERR;
+         /* Ceiling if P[t] is not a power of two but it is not a problem if P[t] is a power of two. */
+         if ((err = mp_incr(&R[t])) != MP_OKAY)                                                          goto LTM_ERR;
+      }
    }
 
    /* And finally: start the recursion. */
    if ((err = s_mp_to_radix_recursive(a, sptr, &part_maxlen, &part_written, radix,
-                                      k, t, false, P, R)) != MP_OKAY)                                    goto LTM_ERR;
+                                      k, t, false, num_ovf, P, R)) != MP_OKAY)                           goto LTM_ERR;
    /* part_written does not account for EOS */
    *written = part_written + 1;
 
diff --git a/s_mp_slower_read_radix.c b/s_mp_slower_read_radix.c
new file mode 100644
index 000000000..852c47cb4
--- /dev/null
+++ b/s_mp_slower_read_radix.c
@@ -0,0 +1,39 @@
+#include "tommath_private.h"
+#ifdef S_MP_SLOWER_READ_RADIX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+mp_err s_mp_slower_read_radix(mp_int *a, const char *str, size_t start, size_t end, int radix)
+{
+   mp_err err;
+   size_t i;
+
+   /* checks are done by caller */
+
+   char *_s = (char *)(str + start);
+   for (i = start; (i < end) && (*_s != '\0'); i++) {
+      uint8_t y;
+
+      char ch = (radix <= 36) ? (char)MP_TOUPPER((int)*_s) : *_s;
+      unsigned int pos = (unsigned int)(ch - '+');
+      if (MP_RADIX_MAP_REVERSE_SIZE <= pos) {
+         err = MP_VAL;
+         goto LBL_ERR;
+      }
+      y = s_mp_radix_map_reverse[pos];
+      if (y >= radix) {
+         err = MP_VAL;
+         goto LBL_ERR;
+      }
+      if ((err = mp_mul_d(a, (mp_digit)radix, a)) != MP_OKAY)                                            goto LBL_ERR;
+      if ((err = mp_add_d(a, (mp_digit)y, a)) != MP_OKAY)                                                goto LBL_ERR;
+      _s++;
+   }
+
+   return MP_OKAY;
+LBL_ERR:
+   mp_zero(a);
+   return err;
+}
+
+#endif
diff --git a/s_mp_slower_to_radix.c b/s_mp_slower_to_radix.c
index f0f5caf52..6dacb13a1 100644
--- a/s_mp_slower_to_radix.c
+++ b/s_mp_slower_to_radix.c
@@ -45,7 +45,7 @@ mp_err s_mp_slower_to_radix(const mp_int *a, char **str,
       ybar--;
    }
 
-   /* Fill in leading zeros if this chunk contains the most significant digits. */
+   /* Fill in leading zeros if this chunk does not contain the most significant digits. */
    if (pad) {
       while ((ybar-- > 0) && (((*part_maxlen)--) > 0)) {
          *s++ = '0';
@@ -60,7 +60,7 @@ mp_err s_mp_slower_to_radix(const mp_int *a, char **str,
    s_reverse(s, digs);
    /* step forward */
    *str += digs;
-   /* Add EOS at teh end of every chunk to allow this function to be used stand-alone */
+   /* Add EOS at the end of every chunk to allow this function to be used stand-alone */
    **str = '\0';
 
    /* TODO: this method to increase "written" is not threadsafe! */
diff --git a/tommath_private.h b/tommath_private.h
index b55392c49..b510be254 100644
--- a/tommath_private.h
+++ b/tommath_private.h
@@ -231,9 +231,13 @@ MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR;
 
 /* Radix conversion O(M(n) log n) Schoenhage method */
 MP_PRIVATE mp_err s_mp_faster_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *written, int radix) MP_WUR;
+MP_PRIVATE mp_err s_mp_faster_read_radix(mp_int *a, const char *str, size_t start, size_t end, int radix) MP_WUR;
 /* Radix conversion O(n^2) method */
 MP_PRIVATE mp_err s_mp_slower_to_radix(const mp_int *a, char **str, size_t *part_maxlen, size_t *part_written,
                                        int radix, bool pad) MP_WUR;
+MP_PRIVATE mp_err s_mp_slower_read_radix(mp_int *a, const char *str, size_t start, size_t end, int radix) MP_WUR;
+
+
 
 #define MP_RADIX_MAP_REVERSE_SIZE 80u
 #define MP_RADIX_EXPONENT_Y_SIZE  65u
@@ -243,11 +247,11 @@ extern MP_PRIVATE const mp_digit s_mp_prime_tab[];
 extern MP_PRIVATE const uint8_t s_mp_radix_exponent_y[];
 
 /*
-  There is not much to tune here, the steps are of the form 2^k and too large
-  for tuning to make a alot of sense.
+  This is the value without the Newton-Raphson optimization.
+  Tuneable?
  */
 #ifndef MP_RADIX_BARRETT_START_MULTIPLICATOR
-#   define MP_RADIX_BARRETT_START_MULTIPLICATOR   8
+#   define MP_RADIX_BARRETT_START_MULTIPLICATOR   50
 #endif
 
 

From 11e735039e5eeabf68a0fa2f5a6e09a3920e4097 Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Tue, 11 Apr 2023 14:36:40 +0200
Subject: [PATCH 3/8] add option to use standard `strlen()` instead of
 home-baked version

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 mp_read_radix.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mp_read_radix.c b/mp_read_radix.c
index f02d8c592..4b4313a68 100644
--- a/mp_read_radix.c
+++ b/mp_read_radix.c
@@ -3,6 +3,11 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */
 
+#ifdef MP_USE_MEMOPS
+#  include <string.h>
+
+#  define MP_STRLEN(s) strlen(s)
+#else
 static size_t s_mp_strlen(const char *s)
 {
    const char *p;
@@ -12,6 +17,8 @@ static size_t s_mp_strlen(const char *s)
    }
    return (size_t)(p - s);
 }
+#  define MP_STRLEN(s) s_mp_strlen(s)
+#endif
 
 /* read a string [ASCII] in a given radix */
 mp_err mp_read_radix(mp_int *a, const char *str, int radix)
@@ -34,7 +41,7 @@ mp_err mp_read_radix(mp_int *a, const char *str, int radix)
       sign = MP_NEG;
    }
 
-   slen = s_mp_strlen(str);
+   slen = MP_STRLEN(str);
 
    mp_zero(a);
 

From fec4f5ca4c9c69e247c9ab669c3d1420ccd23682 Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Tue, 11 Apr 2023 16:08:55 +0200
Subject: [PATCH 4/8] add `s_mp_floor_ilog2()`

instead of all those copies

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 libtommath_VS2008.vcproj | 20 ++++++++++++
 makefile                 | 13 ++++----
 makefile.mingw           | 13 ++++----
 makefile.msvc            | 13 ++++----
 makefile.shared          | 13 ++++----
 makefile.unix            | 13 ++++----
 mp_prime_is_prime.c      | 12 +------
 s_mp_faster_read_radix.c | 17 +++-------
 s_mp_floor_ilog2.c       | 15 +++++++++
 s_mp_fp_log_d.c          | 11 +------
 sources.cmake            |  5 +++
 tommath_class.h          | 68 ++++++++++++++++++++++++++++++++++++----
 tommath_private.h        |  1 +
 13 files changed, 144 insertions(+), 70 deletions(-)
 create mode 100644 s_mp_floor_ilog2.c

diff --git a/libtommath_VS2008.vcproj b/libtommath_VS2008.vcproj
index 13158a09d..c90ff4b6f 100644
--- a/libtommath_VS2008.vcproj
+++ b/libtommath_VS2008.vcproj
@@ -832,6 +832,18 @@
 			RelativePath="s_mp_exptmod_fast.c"
 			>
 		</File>
+		<File
+			RelativePath="s_mp_faster_read_radix.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_faster_to_radix.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_floor_ilog2.c"
+			>
+		</File>
 		<File
 			RelativePath="s_mp_fp_log.c"
 			>
@@ -908,6 +920,14 @@
 			RelativePath="s_mp_rand_platform.c"
 			>
 		</File>
+		<File
+			RelativePath="s_mp_slower_read_radix.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_slower_to_radix.c"
+			>
+		</File>
 		<File
 			RelativePath="s_mp_sqr.c"
 			>
diff --git a/makefile b/makefile
index f8feff7cc..21a40483f 100644
--- a/makefile
+++ b/makefile
@@ -44,12 +44,13 @@ mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
 mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_faster_read_radix.o \
+s_mp_faster_to_radix.o s_mp_floor_ilog2.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o \
+s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o \
+s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o \
+s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o \
+s_mp_rand_platform.o s_mp_slower_read_radix.o s_mp_slower_to_radix.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
 
 #END_INS
 
diff --git a/makefile.mingw b/makefile.mingw
index 532747be0..3cd8f5b53 100644
--- a/makefile.mingw
+++ b/makefile.mingw
@@ -46,12 +46,13 @@ mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
 mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_faster_read_radix.o \
+s_mp_faster_to_radix.o s_mp_floor_ilog2.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o \
+s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o \
+s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o \
+s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o \
+s_mp_rand_platform.o s_mp_slower_read_radix.o s_mp_slower_to_radix.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
 
 HEADERS_PUB=tommath.h
 HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB)
diff --git a/makefile.msvc b/makefile.msvc
index 5d1285490..903c3f4b1 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -42,12 +42,13 @@ mp_reduce_setup.obj mp_root_n.obj mp_rshd.obj mp_sbin_size.obj mp_set.obj mp_set
 mp_set_l.obj mp_set_u32.obj mp_set_u64.obj mp_set_ul.obj mp_shrink.obj mp_signed_rsh.obj mp_sqrmod.obj mp_sqrt.obj \
 mp_sqrtmod_prime.obj mp_sub.obj mp_sub_d.obj mp_submod.obj mp_to_radix.obj mp_to_sbin.obj mp_to_ubin.obj mp_ubin_size.obj \
 mp_unpack.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_copy_digs.obj s_mp_div_3.obj s_mp_div_recursive.obj \
-s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_fp_log.obj s_mp_fp_log_d.obj \
-s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj s_mp_log_2expt.obj s_mp_montgomery_reduce_comba.obj s_mp_mul.obj \
-s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj \
-s_mp_mul_toom.obj s_mp_prime_is_divisible.obj s_mp_prime_tab.obj s_mp_radix_map.obj \
-s_mp_radix_size_overestimate.obj s_mp_rand_platform.obj s_mp_sqr.obj s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj \
-s_mp_sqr_toom.obj s_mp_sub.obj s_mp_zero_buf.obj s_mp_zero_digs.obj
+s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_faster_read_radix.obj \
+s_mp_faster_to_radix.obj s_mp_floor_ilog2.obj s_mp_fp_log.obj s_mp_fp_log_d.obj s_mp_get_bit.obj s_mp_invmod.obj \
+s_mp_invmod_odd.obj s_mp_log_2expt.obj s_mp_montgomery_reduce_comba.obj s_mp_mul.obj s_mp_mul_balance.obj \
+s_mp_mul_comba.obj s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj s_mp_mul_toom.obj \
+s_mp_prime_is_divisible.obj s_mp_prime_tab.obj s_mp_radix_map.obj s_mp_radix_size_overestimate.obj \
+s_mp_rand_platform.obj s_mp_slower_read_radix.obj s_mp_slower_to_radix.obj s_mp_sqr.obj s_mp_sqr_comba.obj \
+s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj s_mp_zero_buf.obj s_mp_zero_digs.obj
 
 HEADERS_PUB=tommath.h
 HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB)
diff --git a/makefile.shared b/makefile.shared
index c9b933513..c48efdd6a 100644
--- a/makefile.shared
+++ b/makefile.shared
@@ -41,12 +41,13 @@ mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
 mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_faster_read_radix.o \
+s_mp_faster_to_radix.o s_mp_floor_ilog2.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o \
+s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o \
+s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o \
+s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o \
+s_mp_rand_platform.o s_mp_slower_read_radix.o s_mp_slower_to_radix.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
 
 #END_INS
 
diff --git a/makefile.unix b/makefile.unix
index 9fe939f04..737a73377 100644
--- a/makefile.unix
+++ b/makefile.unix
@@ -47,12 +47,13 @@ mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
 mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_faster_read_radix.o \
+s_mp_faster_to_radix.o s_mp_floor_ilog2.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o \
+s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o \
+s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o \
+s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o \
+s_mp_rand_platform.o s_mp_slower_read_radix.o s_mp_slower_to_radix.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
 
 
 HEADERS_PUB=tommath.h
diff --git a/mp_prime_is_prime.c b/mp_prime_is_prime.c
index bb24f5944..55c054e8a 100644
--- a/mp_prime_is_prime.c
+++ b/mp_prime_is_prime.c
@@ -3,16 +3,6 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */
 
-/* portable integer log of two with small footprint */
-static unsigned int s_floor_ilog2(int value)
-{
-   unsigned int r = 0;
-   while ((value >>= 1) != 0) {
-      r++;
-   }
-   return r;
-}
-
 mp_err mp_prime_is_prime(const mp_int *a, int t, bool *result)
 {
    mp_int  b;
@@ -186,7 +176,7 @@ mp_err mp_prime_is_prime(const mp_int *a, int t, bool *result)
        * Hence the ugly type-fiddling in the following code.
        */
       size_a = mp_count_bits(a);
-      mask = (1u << s_floor_ilog2(size_a)) - 1u;
+      mask = (1u << s_mp_floor_ilog2((unsigned int)size_a)) - 1u;
       /*
          Assuming the General Rieman hypothesis (never thought to write that in a
          comment) the upper bound can be lowered to  2*(log a)^2.
diff --git a/s_mp_faster_read_radix.c b/s_mp_faster_read_radix.c
index 76c0441b9..0843c0da4 100644
--- a/s_mp_faster_read_radix.c
+++ b/s_mp_faster_read_radix.c
@@ -18,20 +18,11 @@ static const uint8_t s_read_radix_cutoff[65] = { 0, 0,
                                                  18, 17, 17, 17, 17, 17, 17       /* 58 .. 64 */
                                                };
 
-/* This is in mp_prime_is_prime.c and can be reused */
-static int s_floor_ilog2(int value)
-{
-   int r = 0;
-   while ((value >>= 1) != 0) {
-      r++;
-   }
-   return r;
-}
-
 mp_err s_mp_faster_read_radix(mp_int *a, const char *str, size_t start, size_t end, int radix)
 {
    size_t len, mid;
    mp_int A, B, m;
+   mp_digit radix_ = (mp_digit)radix;
    mp_err err = MP_OKAY;
 
    len = end - start;
@@ -42,7 +33,7 @@ mp_err s_mp_faster_read_radix(mp_int *a, const char *str, size_t start, size_t e
 
    mid = len / 2u;
 
-   if ((err = mp_init_set(&m, (mp_digit)radix)) != MP_OKAY) {
+   if ((err = mp_init_set(&m, radix_)) != MP_OKAY) {
       return err;
    }
    if ((err = mp_init_multi(&A, &B, NULL)) != MP_OKAY) {
@@ -53,8 +44,8 @@ mp_err s_mp_faster_read_radix(mp_int *a, const char *str, size_t start, size_t e
    if ((err = s_mp_slower_read_radix(&A, str, start, start + mid + 1, radix)) != MP_OKAY)                goto LTM_ERR;
    if ((err = s_mp_slower_read_radix(&B, str, start + mid +1, end, radix)) != MP_OKAY)                   goto LTM_ERR;
 
-   if (MP_IS_2EXPT((unsigned int)radix)) {
-      if ((err = mp_mul_2d(&A, (int)(((len - mid) - 1u) * (size_t)s_floor_ilog2(radix)), &A)) != MP_OKAY)goto LTM_ERR;
+   if (MP_IS_2EXPT(radix_)) {
+      if ((err = mp_mul_2d(&A, (int)(((len - mid) - 1u) * s_mp_floor_ilog2(radix_)), &A)) != MP_OKAY)    goto LTM_ERR;
    } else {
       if ((err = mp_expt_n(&m, (int)((len - mid) - 1u), &m)) != MP_OKAY)                                 goto LTM_ERR;
       if ((err = mp_mul(&A, &m, &A)) != MP_OKAY)                                                         goto LTM_ERR;
diff --git a/s_mp_floor_ilog2.c b/s_mp_floor_ilog2.c
new file mode 100644
index 000000000..ec32b1337
--- /dev/null
+++ b/s_mp_floor_ilog2.c
@@ -0,0 +1,15 @@
+#include "tommath_private.h"
+#ifdef S_MP_FLOOR_ILOG2_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+size_t s_mp_floor_ilog2(mp_word value)
+{
+   size_t r = 0u;
+   while ((value >>= 1) != 0u) {
+      r++;
+   }
+   return r;
+}
+
+#endif
diff --git a/s_mp_fp_log_d.c b/s_mp_fp_log_d.c
index 71d82dc41..e7ce5f6e9 100644
--- a/s_mp_fp_log_d.c
+++ b/s_mp_fp_log_d.c
@@ -3,22 +3,13 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */
 
-static mp_word s_mp_flog2_mp_word_d(mp_word value)
-{
-   mp_word r = 0u;
-   while ((value >>= 1) != 0u) {
-      r++;
-   }
-   return r;
-}
-
 /* Fixed point bitwise logarithm base two of "x" with precision "p"  */
 static mp_err s_mp_fp_log_fraction_d(mp_word x, int p, mp_word *c)
 {
    mp_word b, L_out, L, a_bar, twoep;
    int i;
 
-   L = s_mp_flog2_mp_word_d(x);
+   L = s_mp_floor_ilog2(x);
 
    if ((L + (mp_word)p) > MP_UPPER_LIMIT_FIXED_LOG) {
       return MP_VAL;
diff --git a/sources.cmake b/sources.cmake
index bbb2aeab6..87c6b9820 100644
--- a/sources.cmake
+++ b/sources.cmake
@@ -132,6 +132,9 @@ s_mp_div_school.c
 s_mp_div_small.c
 s_mp_exptmod.c
 s_mp_exptmod_fast.c
+s_mp_faster_read_radix.c
+s_mp_faster_to_radix.c
+s_mp_floor_ilog2.c
 s_mp_fp_log.c
 s_mp_fp_log_d.c
 s_mp_get_bit.c
@@ -151,6 +154,8 @@ s_mp_prime_tab.c
 s_mp_radix_map.c
 s_mp_radix_size_overestimate.c
 s_mp_rand_platform.c
+s_mp_slower_read_radix.c
+s_mp_slower_to_radix.c
 s_mp_sqr.c
 s_mp_sqr_comba.c
 s_mp_sqr_karatsuba.c
diff --git a/tommath_class.h b/tommath_class.h
index e08bc5f3c..c5edbc641 100644
--- a/tommath_class.h
+++ b/tommath_class.h
@@ -141,6 +141,9 @@
 #   define S_MP_DIV_SMALL_C
 #   define S_MP_EXPTMOD_C
 #   define S_MP_EXPTMOD_FAST_C
+#   define S_MP_FASTER_READ_RADIX_C
+#   define S_MP_FASTER_TO_RADIX_C
+#   define S_MP_FLOOR_ILOG2_C
 #   define S_MP_FP_LOG_C
 #   define S_MP_FP_LOG_D_C
 #   define S_MP_GET_BIT_C
@@ -160,6 +163,8 @@
 #   define S_MP_RADIX_MAP_C
 #   define S_MP_RADIX_SIZE_OVERESTIMATE_C
 #   define S_MP_RAND_PLATFORM_C
+#   define S_MP_SLOWER_READ_RADIX_C
+#   define S_MP_SLOWER_TO_RADIX_C
 #   define S_MP_SQR_C
 #   define S_MP_SQR_COMBA_C
 #   define S_MP_SQR_KARATSUBA_C
@@ -647,6 +652,7 @@
 #   define MP_RAND_C
 #   define MP_READ_RADIX_C
 #   define MP_SET_C
+#   define S_MP_FLOOR_ILOG2_C
 #   define S_MP_PRIME_IS_DIVISIBLE_C
 #endif
 
@@ -734,9 +740,10 @@
 #endif
 
 #if defined(MP_READ_RADIX_C)
-#   define MP_ADD_D_C
-#   define MP_MUL_D_C
 #   define MP_ZERO_C
+#   define S_MP_FASTER_READ_RADIX_C
+#   define S_MP_SLOWER_READ_RADIX_C
+#   define S_MP_STRLEN_C
 #endif
 
 #if defined(MP_REDUCE_C)
@@ -931,9 +938,8 @@
 #endif
 
 #if defined(MP_TO_RADIX_C)
-#   define MP_CLEAR_C
-#   define MP_DIV_D_C
-#   define MP_INIT_COPY_C
+#   define S_MP_FASTER_TO_RADIX_C
+#   define S_MP_SLOWER_TO_RADIX_C
 #endif
 
 #if defined(MP_TO_SBIN_C)
@@ -1069,6 +1075,44 @@
 #   define S_MP_MONTGOMERY_REDUCE_COMBA_C
 #endif
 
+#if defined(S_MP_FASTER_READ_RADIX_C)
+#   define MP_ADD_C
+#   define MP_CLEAR_C
+#   define MP_CLEAR_MULTI_C
+#   define MP_EXPT_N_C
+#   define MP_INIT_MULTI_C
+#   define MP_INIT_SET_C
+#   define MP_MUL_2D_C
+#   define MP_MUL_C
+#   define S_MP_FLOOR_ILOG2_C
+#   define S_MP_SLOWER_READ_RADIX_C
+#endif
+
+#if defined(S_MP_FASTER_TO_RADIX_C)
+#   define MP_2EXPT_C
+#   define MP_ADD_C
+#   define MP_ADD_D_C
+#   define MP_CLEAR_C
+#   define MP_CLEAR_MULTI_C
+#   define MP_CMP_C
+#   define MP_COUNT_BITS_C
+#   define MP_DIV_2D_C
+#   define MP_DIV_C
+#   define MP_EXPT_N_C
+#   define MP_INIT_C
+#   define MP_INIT_I32_C
+#   define MP_INIT_MULTI_C
+#   define MP_MUL_C
+#   define MP_SUB_C
+#   define MP_SUB_D_C
+#   define S_MP_SLOWER_TO_RADIX_C
+#   define S_MP_TO_RADIX_RECURSIVE_C
+#   define S_MP_ZERO_BUF_C
+#endif
+
+#if defined(S_MP_FLOOR_ILOG2_C)
+#endif
+
 #if defined(S_MP_FP_LOG_C)
 #   define MP_2EXPT_C
 #   define MP_ADD_C
@@ -1091,7 +1135,7 @@
 #   define MP_DIV_2D_C
 #   define MP_GET_I64_C
 #   define MP_INIT_C
-#   define S_MP_FLOG2_MP_WORD_D_C
+#   define S_MP_FLOOR_ILOG2_C
 #   define S_MP_FP_LOG_FRACTION_D_C
 #endif
 
@@ -1234,6 +1278,18 @@
 #if defined(S_MP_RAND_PLATFORM_C)
 #endif
 
+#if defined(S_MP_SLOWER_READ_RADIX_C)
+#   define MP_ADD_D_C
+#   define MP_MUL_D_C
+#   define MP_ZERO_C
+#endif
+
+#if defined(S_MP_SLOWER_TO_RADIX_C)
+#   define MP_CLEAR_C
+#   define MP_DIV_D_C
+#   define MP_INIT_COPY_C
+#endif
+
 #if defined(S_MP_SQR_C)
 #   define MP_CLAMP_C
 #   define MP_CLEAR_C
diff --git a/tommath_private.h b/tommath_private.h
index b510be254..1c5061404 100644
--- a/tommath_private.h
+++ b/tommath_private.h
@@ -223,6 +223,7 @@ MP_PRIVATE void s_mp_copy_digs(mp_digit *d, const mp_digit *s, int digits);
 MP_PRIVATE void s_mp_zero_buf(void *mem, size_t size);
 MP_PRIVATE void s_mp_zero_digs(mp_digit *d, int digits);
 MP_PRIVATE mp_err s_mp_radix_size_overestimate(const mp_int *a, const int radix, size_t *size);
+MP_PRIVATE size_t s_mp_floor_ilog2(mp_word value);
 
 #define MP_PRECISION_FIXED_LOG   ( (int) (((sizeof(mp_word) * CHAR_BIT) / 2) - 1))
 #define MP_UPPER_LIMIT_FIXED_LOG ( (int) ( (sizeof(mp_word) * CHAR_BIT) - 1))

From f7d5d32b3ab44cc21d1e14ac46d497875c2513a7 Mon Sep 17 00:00:00 2001
From: czurnieden <czurnieden@gmx.de>
Date: Wed, 21 Jun 2023 15:21:05 +0200
Subject: [PATCH 5/8] Print additional debugging information in test.c

---
 demo/test.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/demo/test.c b/demo/test.c
index c48d5d80f..c3983fd2b 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -1223,12 +1223,13 @@ static int test_mp_read_radix(void)
    t_slow = stop - start;
 
    /* It is "long int" in GLibC but can be bigger and/or even a floating point elsewhere */
-   printf("SLOW: %.10f, FAST: %.10f\n", (double)t_slow/(double)CLOCKS_PER_SEC, (double)t_fast/(double)CLOCKS_PER_SEC);
+   fprintf(stderr,"SLOW: %.10f, FAST: %.10f\n", (double)t_slow/(double)CLOCKS_PER_SEC, (double)t_fast/(double)CLOCKS_PER_SEC);
 
    /* Check if the branching works. */
    if (MP_HAS(S_MP_FASTER_READ_RADIX) && MP_HAS(S_MP_FASTER_TO_RADIX)) {
       if (t_fast > t_slow) {
          fprintf(stderr, "Timing suspicious in test_mp_read_radix. No fast multiplication? Cut-off too low?\n");
+         DO(mp_fwrite(&a, 16, stderr));
          goto LBL_ERR;
       }
    }

From 7d8209acb379cc91dce0adf9eeeced8f8697f80a Mon Sep 17 00:00:00 2001
From: czurnieden <czurnieden@gmx.de>
Date: Wed, 21 Jun 2023 15:22:15 +0200
Subject: [PATCH 6/8] formatted

---
 demo/test.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/demo/test.c b/demo/test.c
index c3983fd2b..c793de897 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -1223,7 +1223,8 @@ static int test_mp_read_radix(void)
    t_slow = stop - start;
 
    /* It is "long int" in GLibC but can be bigger and/or even a floating point elsewhere */
-   fprintf(stderr,"SLOW: %.10f, FAST: %.10f\n", (double)t_slow/(double)CLOCKS_PER_SEC, (double)t_fast/(double)CLOCKS_PER_SEC);
+   fprintf(stderr,"SLOW: %.10f, FAST: %.10f\n", (double)t_slow/(double)CLOCKS_PER_SEC,
+           (double)t_fast/(double)CLOCKS_PER_SEC);
 
    /* Check if the branching works. */
    if (MP_HAS(S_MP_FASTER_READ_RADIX) && MP_HAS(S_MP_FASTER_TO_RADIX)) {

From 816833c730c2757e341f7b8d9486a043375ddd71 Mon Sep 17 00:00:00 2001
From: czurnieden <czurnieden@gmx.de>
Date: Wed, 21 Jun 2023 16:13:42 +0200
Subject: [PATCH 7/8] temporarily removed timing

---
 demo/test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/demo/test.c b/demo/test.c
index c793de897..a93620366 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -1226,7 +1226,7 @@ static int test_mp_read_radix(void)
    fprintf(stderr,"SLOW: %.10f, FAST: %.10f\n", (double)t_slow/(double)CLOCKS_PER_SEC,
            (double)t_fast/(double)CLOCKS_PER_SEC);
 
-   /* Check if the branching works. */
+   /* Check if the branching works.
    if (MP_HAS(S_MP_FASTER_READ_RADIX) && MP_HAS(S_MP_FASTER_TO_RADIX)) {
       if (t_fast > t_slow) {
          fprintf(stderr, "Timing suspicious in test_mp_read_radix. No fast multiplication? Cut-off too low?\n");
@@ -1234,7 +1234,7 @@ static int test_mp_read_radix(void)
          goto LBL_ERR;
       }
    }
-
+   */
 
    free(buffer);
 

From 28b373dae6409f7bd771fe36760a1dddb9290b0c Mon Sep 17 00:00:00 2001
From: czurnieden <czurnieden@gmx.de>
Date: Sat, 1 Jul 2023 01:50:49 +0200
Subject: [PATCH 8/8] clean up

---
 demo/test.c              | 16 ++++++++++------
 mp_to_radix.c            |  2 ++
 s_mp_faster_read_radix.c |  6 +++---
 s_mp_slower_to_radix.c   | 21 ++++++++++-----------
 4 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/demo/test.c b/demo/test.c
index a93620366..0132baeea 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -1158,7 +1158,7 @@ static int test_mp_read_radix(void)
    size_t written, maxlen;
    int bignum, i;
 
-   char *buffer, *bcpy;
+   char *buffer, *bcpy, *startb;
 
    clock_t start, stop, t_slow, t_fast;
 
@@ -1191,18 +1191,19 @@ static int test_mp_read_radix(void)
 
    /* Test the fast method with a slightly larger number */
 
-   /* Must be bigger than the cut-off value, of course */
-   bignum = (2 * 20 * MP_RADIX_BARRETT_START_MULTIPLICATOR)  * 10;
+   /* Needs to be big enough to make a sufficiently large timing difference */
+   bignum = 30000;
    buffer = (char *)malloc((size_t)(bignum + 2));
    if (buffer == NULL) {
       goto LBL_ERR;
    }
    DO(mp_rand(&a, bignum / MP_DIGIT_BIT));
-   printf("\nNumber of limbs in &b = %d, bit_count of &b = %d\n", bignum / MP_DIGIT_BIT, mp_count_bits(&a));
+   fprintf(stderr,"\nNumber of limbs in &b = %d, bit_count of &b = %d\n", bignum / MP_DIGIT_BIT, mp_count_bits(&a));
    start = clock();
    for (i = 2; i < 65; i++) {
       /* printf("FAST radix = %d\n",i); */
       DO(mp_to_radix(&a, buffer, (size_t)(bignum + 1), &written, i));
+      mp_zero(&b);
       DO(mp_read_radix(&b, buffer, i));
       EXPECT(mp_cmp(&a, &b) == MP_EQ);
    }
@@ -1215,7 +1216,11 @@ static int test_mp_read_radix(void)
       /* printf("SLOW radix = %d\n",i); */
       maxlen = (size_t)(bignum + 1);
       bcpy = buffer;
+      /* s_mp_slower_to_radix is very rudimentary as a stand-alone */
+      startb = bcpy;
       DO(s_mp_slower_to_radix(&a, &bcpy, &maxlen, &written, i, false));
+      bcpy = startb;
+      mp_zero(&b);
       DO(s_mp_slower_read_radix(&b, bcpy, 0, strlen(bcpy), i));
       EXPECT(mp_cmp(&a, &b) == MP_EQ);
    }
@@ -1226,7 +1231,7 @@ static int test_mp_read_radix(void)
    fprintf(stderr,"SLOW: %.10f, FAST: %.10f\n", (double)t_slow/(double)CLOCKS_PER_SEC,
            (double)t_fast/(double)CLOCKS_PER_SEC);
 
-   /* Check if the branching works.
+   /* Check if the branching works. */
    if (MP_HAS(S_MP_FASTER_READ_RADIX) && MP_HAS(S_MP_FASTER_TO_RADIX)) {
       if (t_fast > t_slow) {
          fprintf(stderr, "Timing suspicious in test_mp_read_radix. No fast multiplication? Cut-off too low?\n");
@@ -1234,7 +1239,6 @@ static int test_mp_read_radix(void)
          goto LBL_ERR;
       }
    }
-   */
 
    free(buffer);
 
diff --git a/mp_to_radix.c b/mp_to_radix.c
index 114902f36..e7509efe5 100644
--- a/mp_to_radix.c
+++ b/mp_to_radix.c
@@ -47,7 +47,9 @@ mp_err mp_to_radix(const mp_int *a, char *str, size_t maxlen, size_t *written, i
       if ((err = s_mp_faster_to_radix(&a_bar, str, maxlen, &part_written, radix)) != MP_OKAY)            goto LBL_ERR;
    } else {
       if (MP_HAS(S_MP_SLOWER_TO_RADIX)) {
+         char *start = str;
          if ((err = s_mp_slower_to_radix(&a_bar, &str, &maxlen, &part_written, radix, false)) != MP_OKAY) goto LBL_ERR;
+         str = start;
          /* part_written does not count EOS */
          part_written++;
       }
diff --git a/s_mp_faster_read_radix.c b/s_mp_faster_read_radix.c
index 0843c0da4..f35d89472 100644
--- a/s_mp_faster_read_radix.c
+++ b/s_mp_faster_read_radix.c
@@ -3,7 +3,7 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */
 
-/* TODO: It is tunable */
+/* TODO: It is tunable. Or is it? */
 #define MP_READ_RADIX_CUTOFF_MULTIPLICATOR 3
 
 /* for(n=2,64,t = ceil(log(2^100)/log(n)); printf(t", "); ) */
@@ -41,8 +41,8 @@ mp_err s_mp_faster_read_radix(mp_int *a, const char *str, size_t start, size_t e
       return err;
    }
 
-   if ((err = s_mp_slower_read_radix(&A, str, start, start + mid + 1, radix)) != MP_OKAY)                goto LTM_ERR;
-   if ((err = s_mp_slower_read_radix(&B, str, start + mid +1, end, radix)) != MP_OKAY)                   goto LTM_ERR;
+   if ((err = s_mp_faster_read_radix(&A, str, start, start + mid + 1, radix)) != MP_OKAY)                goto LTM_ERR;
+   if ((err = s_mp_faster_read_radix(&B, str, start + mid +1, end, radix)) != MP_OKAY)                   goto LTM_ERR;
 
    if (MP_IS_2EXPT(radix_)) {
       if ((err = mp_mul_2d(&A, (int)(((len - mid) - 1u) * s_mp_floor_ilog2(radix_)), &A)) != MP_OKAY)    goto LTM_ERR;
diff --git a/s_mp_slower_to_radix.c b/s_mp_slower_to_radix.c
index 6dacb13a1..0dffb2b79 100644
--- a/s_mp_slower_to_radix.c
+++ b/s_mp_slower_to_radix.c
@@ -20,29 +20,30 @@ mp_err s_mp_slower_to_radix(const mp_int *a, char **str,
    mp_int t;
    mp_digit d;
    mp_err err = MP_OKAY;
-
-   /* The number of digits of "radix" to be filled if this chunk is not the most significant one. */
-   int ybar = s_mp_radix_exponent_y[radix] * MP_RADIX_BARRETT_START_MULTIPLICATOR;
+   int ybar = 0;
 
    /* A temporary pointer to the output string to make reversal simpler */
    char *s = *str;
 
-   /* TODO: input a is already a copy of the original and we could use it destructively? */
+   /* The number of digits of "radix" to be filled if this chunk is not the most significant one. */
+   if (pad) {
+      ybar = s_mp_radix_exponent_y[radix] * MP_RADIX_BARRETT_START_MULTIPLICATOR;
+   }
+
    if ((err = mp_init_copy(&t, a)) != MP_OKAY)                                                           goto LTM_ERR;
 
    while (!mp_iszero(&t)) {
-      /* TODO: this method to decrease "maxlen" is not threadsafe! */
       if ((--(*part_maxlen)) < 1u) {
          /* no more room */
          err = MP_BUF;
          goto LTM_ERR;
       }
-      if ((err = mp_div_d(&t, (mp_digit)radix, &t, &d)) != MP_OKAY) {
-         goto LTM_ERR;
-      }
+      if ((err = mp_div_d(&t, (mp_digit)radix, &t, &d)) != MP_OKAY)                                      goto LTM_ERR;
       *s++ = s_mp_radix_map[d];
       ++digs;
-      ybar--;
+      if (pad) {
+         ybar--;
+      }
    }
 
    /* Fill in leading zeros if this chunk does not contain the most significant digits. */
@@ -53,7 +54,6 @@ mp_err s_mp_slower_to_radix(const mp_int *a, char **str,
       }
    }
 
-   /* TODO: I think that can be done more elegantly */
    /* "rewind" */
    s = *str;
    /* reverse */
@@ -63,7 +63,6 @@ mp_err s_mp_slower_to_radix(const mp_int *a, char **str,
    /* Add EOS at the end of every chunk to allow this function to be used stand-alone */
    **str = '\0';
 
-   /* TODO: this method to increase "written" is not threadsafe! */
    if (part_written != NULL) {
       *part_written = *part_written + digs;
    }