diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 24f881f3..18a832bb 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -70,6 +70,12 @@ jobs:
           # RSA superclass with tests (no sanitizer, but debug info)
           - { BUILDOPTIONS: '--with-cc=gcc --with-m64 --cflags=-DLTM_NOTHING --cflags=-DSC_RSA_1_WITH_TESTS --limit-valgrind',   SANITIZER: '',  COMPILE_DEBUG: '1', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: '' }
 
+          # Build with small stack-size
+          - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE',                                SANITIZER: '',  COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'gcc-multilib' }
+          - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --limit-valgrind',          SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'clang-10 llvm-10 libc6-dev-i386 gcc-multilib' }
+          - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread --limit-valgrind', SANITIZER: '',  COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'libc6-dev-i386 gcc-multilib' }
+          - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread',             SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
+
           # Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs.
           #- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --limit-valgrind --make-option=tune'
           #- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_32BIT --limit-valgrind --make-option=tune'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6063277..2f59d32e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -138,7 +138,7 @@ if(COMPILE_LTO)
     if(COMPILER_SUPPORTS_LTO)
         set_property(TARGET ${PROJECT_NAME} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
     else()
-        message(SEND_ERROR "This compiler does not support LTO. Reconfigure ${PROJECT_NAME} with -DCOMPILE_LTO=OFF.")
+        message(FATAL_ERROR "This compiler does not support LTO. Reconfigure ${PROJECT_NAME} with -DCOMPILE_LTO=OFF.")
     endif()
 endif()
 
diff --git a/appveyor.yml b/appveyor.yml
index 30d9ee75..2134f2dd 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -11,6 +11,10 @@ image:
 - Visual Studio 2019
 - Visual Studio 2017
 - Visual Studio 2015
+environment:
+  matrix:
+  - CFLAGS_VAR: ""
+    CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /MD /DLTM_TEST_DYNAMIC\""
 build_script:
 - cmd: >-
     if "Visual Studio 2022"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat"
@@ -18,9 +22,9 @@ build_script:
       if "Visual Studio 2017"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
       if "Visual Studio 2015"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64
       if "Visual Studio 2015"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86_amd64
-      nmake -f makefile.msvc test.exe
+      nmake -f makefile.msvc test.exe %CFLAGS_VAR%
       nmake -f makefile.msvc clean-obj
-      nmake -f makefile.msvc test_dll.exe CFLAGS="/Ox /MD /DLTM_TEST_DYNAMIC"
+      nmake -f makefile.msvc test_dll.exe %CFLAGS_VAR_DLL%
 test_script:
 - cmd: test.exe
 - cmd: test_dll.exe
diff --git a/demo/test.c b/demo/test.c
index f290dbf2..2fa6e08d 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -2455,12 +2455,101 @@ static int test_mp_pack_unpack(void)
 #define ONLY_PUBLIC_API_C
 #endif
 
+#if !defined(LTM_TEST_MULTITHREAD)
+#define SINGLE_THREADED_C
+typedef uintptr_t thread_id_t; /* placeholder type, thread_start() fails in this configuration */
+#else /* LTM_TEST_MULTITHREAD */
+#define MULTI_THREADED_C
+#if !defined(_WIN32)
+#define MULTI_THREADED_PTHREAD_C
+#include <pthread.h>
+typedef pthread_t thread_id_t;
+#else /* _WIN32 */
+#define MULTI_THREADED_MSVC_C
+/* request the 0x0501 (Windows XP) API level unless the build already selected one */
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x0501
+#endif
+#ifndef WINVER
+#define WINVER 0x0501
+#endif
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+typedef HANDLE thread_id_t;
+#endif /* _WIN32 */
+#endif /* LTM_TEST_MULTITHREAD */
+/* thread_start() and thread_join() call both thread APIs from ordinary `if (MP_HAS(...))` branches, hence the API that was not included above still needs declarations */
+#if !defined(MULTI_THREADED_PTHREAD_C)
+extern int pthread_create(thread_id_t *, const void *, void *(*)(void *), void *);
+extern int pthread_join(thread_id_t, void **);
+#endif
+
+#if !defined(MULTI_THREADED_MSVC_C)
+extern thread_id_t CreateThread(void *, size_t, unsigned long (*)(void *), void *, unsigned long, void *);
+extern unsigned long WaitForSingleObject(thread_id_t hHandle, unsigned long dwMilliseconds);
+#define INFINITE ((unsigned long)-1)
+#endif
+
+struct test_fn { /* one entry of the unit test table */
+   const char *name; /* printed in the report and matched against the command line arguments */
+   int (*fn)(void);  /* test entry point, NULL if the test is compiled out */
+};
+
+struct thread_info { /* per-thread state of a test that runs in its own thread */
+   thread_id_t thread_id;   /* written by thread_start(), consumed by thread_join() */
+   const struct test_fn *t; /* the test this thread executes */
+   int ret;                 /* result of the test, stored by run() */
+};
+
+static void run(struct thread_info *tinfo)
+{
+   /* Thread body: run the test unless it is compiled out (fn == NULL, see T1/T2 in unit_tests()). */
+   tinfo->ret = (tinfo->t->fn != NULL) ? tinfo->t->fn() : EXIT_SUCCESS;
+   if (mp_warray_free() == -2) /* -2: the work buffer is still marked as in use */
+      tinfo->ret = EXIT_FAILURE;
+}
+
+static void *run_pthread(void *arg)
+{
+   run(arg);
+   /* hand the thread_info back, thread_join() receives it through pthread_join() */
+   return arg;
+}
+
+static unsigned long run_msvc(void *arg)
+{
+   run(arg);
+   /* the test result is stored in the thread_info by run(), the return value is always 0 */
+   return 0;
+}
+
+static int thread_start(struct thread_info *info) /* start info->t in a new thread, the caller expects 0 on success */
+{
+   if (MP_HAS(MULTI_THREADED_PTHREAD))
+      return pthread_create(&info->thread_id, NULL, run_pthread, info);
+   if (MP_HAS(MULTI_THREADED_MSVC)) {
+      info->thread_id = CreateThread(NULL, 0, run_msvc, info, 0, NULL);
+      return info->thread_id == (thread_id_t)NULL ? -1 : 0; /* a NULL handle is reported as failure */
+   }
+   return -1; /* neither thread back-end is compiled in */
+}
+
+static int thread_join(struct thread_info *info, struct thread_info **res) /* wait for the thread, 0 on success with *res set */
+{
+   if (MP_HAS(MULTI_THREADED_PTHREAD))
+      return pthread_join(info->thread_id, (void **)res); /* *res receives what run_pthread() returned */
+   if (MP_HAS(MULTI_THREADED_MSVC)) {
+      if (WaitForSingleObject(info->thread_id, INFINITE) != 0) return -1; /* 0 == WAIT_OBJECT_0, otherwise the thread was not joined */
+      *res = info;
+      return 0;
+   }
+   return -1; /* neither thread back-end is compiled in */
+}
+
 static int unit_tests(int argc, char **argv)
 {
-   static const struct {
-      const char *name;
-      int (*fn)(void);
-   } test[] = {
+   static const struct test_fn test[] = {
 #define T0(n)              { #n, test_##n }
 #define T1(n, o)           { #n, MP_HAS(o) ? test_##n : NULL }
 #define T2(n, o1, o2)      { #n, (MP_HAS(o1) && MP_HAS(o2)) ? test_##n : NULL }
@@ -2522,10 +2611,10 @@ static int unit_tests(int argc, char **argv)
 #undef T2
 #undef T1
    };
+   struct thread_info test_threads[sizeof(test)/sizeof(test[0])], *res;
    unsigned long i, ok, fail, nop;
    uint64_t t;
    int j;
-
    ok = fail = nop = 0;
 
    t = (uint64_t)time(NULL);
@@ -2533,20 +2622,43 @@ static int unit_tests(int argc, char **argv)
    s_mp_rand_jenkins_init(t);
    mp_rand_source(s_mp_rand_jenkins);
 
+   if (MP_HAS(MP_SMALL_STACK_SIZE)) {
+      printf("Small-stack enabled\n\n");
+   }
+
+   if (MP_HAS(MULTI_THREADED)) {
+      printf("Multi-threading enabled\n\n");
+      /* we ignore the fact that jenkins is not thread safe */
+      for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) {
+         test_threads[i].t = &test[i];
+         EXPECT(thread_start(&test_threads[i]) == 0);
+      }
+   }
+
    for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) {
-      if (argc > 1) {
-         for (j = 1; j < argc; ++j) {
-            if (strstr(test[i].name, argv[j]) != NULL) {
-               break;
+      j = -1;
+      if (MP_HAS(SINGLE_THREADED)) {
+         if (argc > 1) {
+            for (j = 1; j < argc; ++j) {
+               if (strstr(test[i].name, argv[j]) != NULL) {
+                  break;
+               }
             }
+            if (j == argc) continue;
          }
-         if (j == argc) continue;
+
+         if (test[i].fn)
+            j = test[i].fn();
+      } else if (MP_HAS(MULTI_THREADED)) {
+         EXPECT(thread_join(&test_threads[i], &res) == 0);
+         j = res->ret;
       }
       printf("TEST %s\n", test[i].name);
+
       if (test[i].fn == NULL) {
          nop++;
          printf("NOP %s\n\n", test[i].name);
-      } else if (test[i].fn() == EXIT_SUCCESS) {
+      } else if (j == EXIT_SUCCESS) {
          ok++;
          printf("\n");
       } else {
@@ -2556,8 +2668,12 @@ static int unit_tests(int argc, char **argv)
    }
    fprintf(fail?stderr:stdout, "Tests OK/NOP/FAIL: %lu/%lu/%lu\n", ok, nop, fail);
 
-   if (fail != 0) return EXIT_FAILURE;
-   else return EXIT_SUCCESS;
+   EXPECT(mp_warray_free() != -2);
+
+   if (fail == 0)
+      return EXIT_SUCCESS;
+LBL_ERR:
+   return EXIT_FAILURE;
 }
 
 int main(int argc, char **argv)
diff --git a/doc/bn.tex b/doc/bn.tex
index 22ae5f3e..63e71633 100644
--- a/doc/bn.tex
+++ b/doc/bn.tex
@@ -352,6 +352,16 @@ \subsubsection{Operand Size Related}
   \end{center}
 \end{small}
 
+\subsection{Small-Stack option}
+\label{ch:SMALL_STACK_INTRO}
+The library can be compiled with the symbol \texttt{MP\_SMALL\_STACK\_SIZE} defined, which results in
+the temporary \texttt{MP\_WARRAY}-sized stack buffers being put on the heap.
+This comes with one drawback: the thread-safety promised for the default configuration is no longer given automatically.
+Therefore, if the Small-Stack option is enabled in a multi-threaded program, one shall always initialize
+the library by calling \texttt{mp\_warray\_init()} once with the correct number of threads.
+
+Cf.\ \ref{ch:SMALL_STACK_API} for the API description and further details.
+
 \section{Purpose of LibTomMath}
 Unlike	GNU MP (GMP) Library, LIP, OpenSSL or various other commercial kits (Miracl), LibTomMath
 was not written with bleeding edge performance in mind.  First and foremost LibTomMath was written
@@ -428,7 +438,11 @@ \chapter{Getting Started with LibTomMath}
 \section{Building Programs}
 In order to use LibTomMath you must include ``tommath.h'' and link against the appropriate library
 file (typically
-libtommath.a).	There is no library initialization required and the entire library is thread safe.
+libtommath.a).	There is no library initialization required and the entire library is thread safe
+if it is used in its default configuration. The small-stack option makes use of atomic operations
+to maintain its internal state and therefore does not require locking, but it MUST be initialized
+if used from multiple threads. For further information see \ref{ch:SMALL_STACK_INTRO} and
+\ref{ch:SMALL_STACK_API}.
 
 \section{Return Codes}
 There are five possible return codes a function may return.
@@ -813,6 +827,37 @@ \subsection{Adding additional digits}
 \end{alltt}
 \end{small}
 
+\section{Small-Stack option}
+\label{ch:SMALL_STACK_API}
+
+In case the \texttt{MP\_SMALL\_STACK\_SIZE} symbol is defined the following functions
+can be useful.
+
+To initialize the internal structure the following function shall be called.
+
+\index{mp\_warray\_init}
+\begin{alltt}
+mp_err mp_warray_init(size_t n_alloc, bool preallocate);
+\end{alltt}
+
+The flag \texttt{preallocate} controls whether the internal buffers --
+\texttt{n\_alloc} buffers of size \texttt{MP\_WARRAY} -- will be allocated when
+\texttt{mp\_warray\_init()} is called, or whether they will be allocated when required.
+
+To free the internally allocated memory the following function shall be called.
+
+\index{mp\_warray\_free}
+\begin{alltt}
+int mp_warray_free(void);
+\end{alltt}
+
+
+Those two API functions are always available, even if the \texttt{MP\_SMALL\_STACK\_SIZE} option
+has been disabled at compile time.
+In that case \texttt{mp\_warray\_init()} will return \texttt{MP\_ERR} and \texttt{mp\_warray\_free()}
+will return $-1$.
+
+
 \chapter{Basic Operations}
 \section{Copying}
 
diff --git a/helper.pl b/helper.pl
index 53658614..ffc592a7 100755
--- a/helper.pl
+++ b/helper.pl
@@ -394,7 +394,7 @@ sub update_dep
     foreach my $filename (glob '*mp_*.c') {
         my $content;
         my $cc = $ENV{'CC'} || 'gcc';
-        $content = `$cc -E -x c -DLTM_ALL $filename`;
+        $content = `$cc -E -x c -DLTM_ALL -DMP_SMALL_STACK_SIZE $filename`;
         $content =~ s/^# 1 "$filename".*?^# 2 "$filename"//ms;
 
         # convert filename to upper case so we can use it as a define
diff --git a/libtommath_VS2008.vcproj b/libtommath_VS2008.vcproj
index 13158a09..71dd3807 100644
--- a/libtommath_VS2008.vcproj
+++ b/libtommath_VS2008.vcproj
@@ -792,6 +792,10 @@
 			RelativePath="mp_unpack.c"
 			>
 		</File>
+		<File
+			RelativePath="mp_warray_free.c"
+			>
+		</File>
 		<File
 			RelativePath="mp_xor.c"
 			>
@@ -928,6 +932,18 @@
 			RelativePath="s_mp_sub.c"
 			>
 		</File>
+		<File
+			RelativePath="s_mp_warray.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_warray_get.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_warray_put.c"
+			>
+		</File>
 		<File
 			RelativePath="s_mp_zero_buf.c"
 			>
diff --git a/makefile b/makefile
index ec32ecd0..8f211f5f 100644
--- a/makefile
+++ b/makefile
@@ -43,13 +43,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \
+s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \
+s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \
+s_mp_zero_buf.o s_mp_zero_digs.o
 
 #END_INS
 
@@ -172,9 +173,10 @@ c89:
 	-e 's/UINT32_MAX/0xFFFFFFFFu/g' \
 	-e 's/UINT64_MAX/(mp_u64)-1/g' \
 	-e 's/INT32_MAX/0x7FFFFFFF/g' \
-        -e 's/INT32_MIN/(-2147483647-1)/g' \
+	-e 's/INT32_MIN/(-2147483647-1)/g' \
 	-e 's/INT64_MAX/(mp_i64)(((mp_u64)1<<63)-1)/g' \
 	-e 's/INT64_MIN/(mp_i64)((mp_u64)1<<63)/g' \
+	-e 's/uintptr_t/mp_uintptr/g' \
 	-e 's/SIZE_MAX/((size_t)-1)/g' \
 	-e 's/\(PRI[ioux]64\)/MP_\1/g' \
 	-e 's/uint\([0-9][0-9]*\)_t/mp_u\1/g' \
@@ -195,10 +197,11 @@ c99:
 	-e 's/false_/MP_NO_/g' \
 	-e 's/0xFFFFFFFFu/UINT32_MAX/g' \
 	-e 's/(mp_u64)-1/UINT64_MAX/g' \
-        -e 's/(-2147483647-1)/INT32_MIN/g' \
+	-e 's/(-2147483647-1)/INT32_MIN/g' \
 	-e 's/0x7FFFFFFF/INT32_MAX/g' \
 	-e 's/(mp_i64)((mp_u64)1<<63)/INT64_MIN/g' \
 	-e 's/(mp_i64)(((mp_u64)1<<63)-1)/INT64_MAX/g' \
+	-e 's/mp_uintptr/uintptr_t/g' \
 	-e 's/((size_t)-1)/SIZE_MAX/g' \
 	-e 's/MP_\(PRI[ioux]64\)/\1/g' \
 	-e 's/mp_u\([0-9][0-9]*\)/uint\1_t/g' \
diff --git a/makefile.mingw b/makefile.mingw
index 532747be..e2445e8a 100644
--- a/makefile.mingw
+++ b/makefile.mingw
@@ -45,13 +45,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \
+s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \
+s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \
+s_mp_zero_buf.o s_mp_zero_digs.o
 
 HEADERS_PUB=tommath.h
 HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB)
diff --git a/makefile.msvc b/makefile.msvc
index 5d128549..8feb425c 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -41,13 +41,14 @@ mp_reduce_2k_l.obj mp_reduce_2k_setup.obj mp_reduce_2k_setup_l.obj mp_reduce_is_
 mp_reduce_setup.obj mp_root_n.obj mp_rshd.obj mp_sbin_size.obj mp_set.obj mp_set_double.obj mp_set_i32.obj mp_set_i64.obj \
 mp_set_l.obj mp_set_u32.obj mp_set_u64.obj mp_set_ul.obj mp_shrink.obj mp_signed_rsh.obj mp_sqrmod.obj mp_sqrt.obj \
 mp_sqrtmod_prime.obj mp_sub.obj mp_sub_d.obj mp_submod.obj mp_to_radix.obj mp_to_sbin.obj mp_to_ubin.obj mp_ubin_size.obj \
-mp_unpack.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_copy_digs.obj s_mp_div_3.obj s_mp_div_recursive.obj \
-s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_fp_log.obj s_mp_fp_log_d.obj \
-s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj s_mp_log_2expt.obj s_mp_montgomery_reduce_comba.obj s_mp_mul.obj \
-s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj \
-s_mp_mul_toom.obj s_mp_prime_is_divisible.obj s_mp_prime_tab.obj s_mp_radix_map.obj \
-s_mp_radix_size_overestimate.obj s_mp_rand_platform.obj s_mp_sqr.obj s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj \
-s_mp_sqr_toom.obj s_mp_sub.obj s_mp_zero_buf.obj s_mp_zero_digs.obj
+mp_unpack.obj mp_warray_free.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_copy_digs.obj s_mp_div_3.obj \
+s_mp_div_recursive.obj s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_fp_log.obj \
+s_mp_fp_log_d.obj s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj s_mp_log_2expt.obj \
+s_mp_montgomery_reduce_comba.obj s_mp_mul.obj s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj \
+s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj s_mp_mul_toom.obj s_mp_prime_is_divisible.obj s_mp_prime_tab.obj \
+s_mp_radix_map.obj s_mp_radix_size_overestimate.obj s_mp_rand_platform.obj s_mp_sqr.obj s_mp_sqr_comba.obj \
+s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj s_mp_warray.obj s_mp_warray_get.obj s_mp_warray_put.obj \
+s_mp_zero_buf.obj s_mp_zero_digs.obj
 
 HEADERS_PUB=tommath.h
 HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB)
diff --git a/makefile.shared b/makefile.shared
index c9b93351..50c33526 100644
--- a/makefile.shared
+++ b/makefile.shared
@@ -40,13 +40,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \
+s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \
+s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \
+s_mp_zero_buf.o s_mp_zero_digs.o
 
 #END_INS
 
diff --git a/makefile.unix b/makefile.unix
index 34ebd1a8..58642098 100644
--- a/makefile.unix
+++ b/makefile.unix
@@ -46,13 +46,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \
+s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \
+s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \
+s_mp_zero_buf.o s_mp_zero_digs.o
 
 
 HEADERS_PUB=tommath.h
diff --git a/makefile_include.mk b/makefile_include.mk
index da897396..d47ea2ba 100644
--- a/makefile_include.mk
+++ b/makefile_include.mk
@@ -97,7 +97,7 @@ endif
 endif # COMPILE_SIZE
 
 ifneq ($(findstring clang,$(CC)),)
-LTM_CFLAGS += -Wno-typedef-redefinition -Wno-tautological-compare -Wno-builtin-requires-header
+LTM_CFLAGS += -Wno-unknown-warning-option -Wno-typedef-redefinition -Wno-tautological-compare -Wno-builtin-requires-header -Wno-incomplete-setjmp-declaration
 ifdef IGNORE_SPEED
 #for dead code eliminiation
 LTM_CFLAGS += -O1
diff --git a/mp_warray_free.c b/mp_warray_free.c
new file mode 100644
index 00000000..f7470f81
--- /dev/null
+++ b/mp_warray_free.c
@@ -0,0 +1,28 @@
+#include "tommath_private.h"
+#ifdef MP_WARRAY_FREE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+/* static check that the multiplication won't overflow */
+MP_STATIC_ASSERT(warray_free_sz_does_not_overflow, (sizeof(mp_word) * MP_WARRAY) >= MP_WARRAY)
+
+static int s_warray_free(void)
+{
+   /* refuse to release anything while the buffer is still handed out */
+   if (s_mp_warray.w_used != NULL)
+      return -2;
+   if (s_mp_warray.w_free != NULL) {
+      s_mp_zero_buf(s_mp_warray.w_free, sizeof(mp_word) * MP_WARRAY);
+      MP_FREE(s_mp_warray.w_free, sizeof(mp_word) * MP_WARRAY);
+      s_mp_warray.w_free = NULL;
+   }
+   return 0;
+}
+
+int mp_warray_free(void)
+{
+   /* -1 reports that the small-stack option is not compiled in */
+   return MP_HAS(MP_SMALL_STACK_SIZE) ? s_warray_free() : -1;
+}
+
+#endif
diff --git a/s_mp_montgomery_reduce_comba.c b/s_mp_montgomery_reduce_comba.c
index 7472caf3..3858f75a 100644
--- a/s_mp_montgomery_reduce_comba.c
+++ b/s_mp_montgomery_reduce_comba.c
@@ -15,9 +15,12 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
 {
    int     ix, oldused;
    mp_err  err;
-   mp_word W[MP_WARRAY];
+   mp_word MP_ALLOC_WARRAY(W);
+
+   MP_CHECK_WARRAY(W);
 
    if (x->used > MP_WARRAY) {
+      MP_FREE_WARRAY(W);
       return MP_VAL;
    }
 
@@ -26,6 +29,7 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
 
    /* grow a as required */
    if ((err = mp_grow(x, n->used + 1)) != MP_OKAY) {
+      MP_FREE_WARRAY(W);
       return err;
    }
 
@@ -110,6 +114,7 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
 
    mp_clamp(x);
 
+   MP_FREE_WARRAY(W);
    /* if A >= m then A = A - m */
    if (mp_cmp_mag(x, n) != MP_LT) {
       return s_mp_sub(x, n, x);
diff --git a/s_mp_mul_comba.c b/s_mp_mul_comba.c
index ca89ff9d..5b37035e 100644
--- a/s_mp_mul_comba.c
+++ b/s_mp_mul_comba.c
@@ -23,15 +23,19 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
 {
    int      oldused, pa, ix;
    mp_err   err;
-   mp_digit W[MP_WARRAY];
+   mp_digit MP_ALLOC_WARRAY(W);
    mp_word  _W;
 
+   MP_CHECK_WARRAY(W);
+
    if (digs < 0) {
+      MP_FREE_WARRAY(W);
       return MP_VAL;
    }
 
    /* grow the destination as required */
    if ((err = mp_grow(c, digs)) != MP_OKAY) {
+      MP_FREE_WARRAY(W);
       return err;
    }
 
@@ -77,6 +81,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
    s_mp_zero_digs(c->dp + c->used, oldused - c->used);
 
    mp_clamp(c);
+   MP_FREE_WARRAY(W);
    return MP_OKAY;
 }
 #endif
diff --git a/s_mp_mul_high_comba.c b/s_mp_mul_high_comba.c
index b5ac06d7..b0096d4e 100644
--- a/s_mp_mul_high_comba.c
+++ b/s_mp_mul_high_comba.c
@@ -16,16 +16,20 @@ mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs
 {
    int     oldused, pa, ix;
    mp_err   err;
-   mp_digit W[MP_WARRAY];
+   mp_digit MP_ALLOC_WARRAY(W);
    mp_word  _W;
 
+   MP_CHECK_WARRAY(W);
+
    if (digs < 0) {
+      MP_FREE_WARRAY(W);
       return MP_VAL;
    }
 
    /* grow the destination as required */
    pa = a->used + b->used;
    if ((err = mp_grow(c, pa)) != MP_OKAY) {
+      MP_FREE_WARRAY(W);
       return err;
    }
 
@@ -69,6 +73,7 @@ mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs
    s_mp_zero_digs(c->dp + c->used, oldused - c->used);
 
    mp_clamp(c);
+   MP_FREE_WARRAY(W);
    return MP_OKAY;
 }
 #endif
diff --git a/s_mp_sqr_comba.c b/s_mp_sqr_comba.c
index 1bcc1f93..336a0a08 100644
--- a/s_mp_sqr_comba.c
+++ b/s_mp_sqr_comba.c
@@ -16,13 +16,16 @@ After that loop you do the squares and add them in.
 mp_err s_mp_sqr_comba(const mp_int *a, mp_int *b)
 {
    int       oldused, pa, ix;
-   mp_digit  W[MP_WARRAY];
+   mp_digit  MP_ALLOC_WARRAY(W);
    mp_word   W1;
    mp_err err;
 
+   MP_CHECK_WARRAY(W);
+
    /* grow the destination as required */
    pa = a->used + a->used;
    if ((err = mp_grow(b, pa)) != MP_OKAY) {
+      MP_FREE_WARRAY(W);
       return err;
    }
 
@@ -82,6 +85,7 @@ mp_err s_mp_sqr_comba(const mp_int *a, mp_int *b)
    s_mp_zero_digs(b->dp + b->used, oldused - b->used);
 
    mp_clamp(b);
+   MP_FREE_WARRAY(W);
    return MP_OKAY;
 }
 #endif
diff --git a/s_mp_warray.c b/s_mp_warray.c
new file mode 100644
index 00000000..1b8b068b
--- /dev/null
+++ b/s_mp_warray.c
@@ -0,0 +1,8 @@
+#include "tommath_private.h"
+#ifdef S_MP_WARRAY_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+/* State of the work buffer used by s_mp_warray_get(), s_mp_warray_put() and mp_warray_free(). */
+mp_thread st_warray s_mp_warray = { 0 };
+
+#endif
diff --git a/s_mp_warray_get.c b/s_mp_warray_get.c
new file mode 100644
index 00000000..26b0d7c1
--- /dev/null
+++ b/s_mp_warray_get.c
@@ -0,0 +1,18 @@
+#include "tommath_private.h"
+#ifdef S_MP_WARRAY_GET_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+void *s_mp_warray_get(void) /* hand out the work buffer, NULL if it is unavailable */
+{
+   if (s_mp_warray.w_used) /* the buffer is already handed out */
+      return NULL;
+   if (s_mp_warray.w_free == NULL) { /* no buffer cached yet, allocate one */
+      s_mp_warray.w_free = MP_CALLOC(MP_WARRAY, sizeof(mp_word));
+   }
+   s_mp_warray.w_used = s_mp_warray.w_free; /* NULL if MP_CALLOC() returned NULL */
+   s_mp_warray.w_free = NULL;
+   return s_mp_warray.w_used;
+}
+
+#endif
diff --git a/s_mp_warray_put.c b/s_mp_warray_put.c
new file mode 100644
index 00000000..79e014ac
--- /dev/null
+++ b/s_mp_warray_put.c
@@ -0,0 +1,14 @@
+#include "tommath_private.h"
+#ifdef S_MP_WARRAY_PUT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+void s_mp_warray_put(void *w) /* give back a buffer obtained from s_mp_warray_get() */
+{
+   if (s_mp_warray.w_free || s_mp_warray.w_used != w) /* ignore anything but the buffer that is currently handed out */
+      return;
+   s_mp_warray.w_free = w; /* keep it cached for the next s_mp_warray_get() */
+   s_mp_warray.w_used = NULL;
+}
+
+#endif
diff --git a/sources.cmake b/sources.cmake
index bbb2aeab..103e9c09 100644
--- a/sources.cmake
+++ b/sources.cmake
@@ -122,6 +122,7 @@ mp_to_sbin.c
 mp_to_ubin.c
 mp_ubin_size.c
 mp_unpack.c
+mp_warray_free.c
 mp_xor.c
 mp_zero.c
 s_mp_add.c
@@ -156,6 +157,9 @@ s_mp_sqr_comba.c
 s_mp_sqr_karatsuba.c
 s_mp_sqr_toom.c
 s_mp_sub.c
+s_mp_warray.c
+s_mp_warray_get.c
+s_mp_warray_put.c
 s_mp_zero_buf.c
 s_mp_zero_digs.c
 )
diff --git a/testme.sh b/testme.sh
index 089e42a7..92997a04 100755
--- a/testme.sh
+++ b/testme.sh
@@ -70,6 +70,8 @@ All other options will be tested with all MP_xBIT configurations.
                             runtime and may trigger the 30 minutes
                             timeout.
 
+    --multithread           Run tests in multi-threaded mode (via pthread).
+
 Godmode:
 
     --all                   Choose all architectures and gcc and clang
@@ -128,7 +130,7 @@ _make()
   echo -ne " Compile $1 $2"
   suffix=$(echo ${1}${2}  | tr ' ' '_')
   _fixup_cflags "$1"
-  CC="$1" CFLAGS="$2 $TEST_CFLAGS" make -j$MAKE_JOBS $3 $MAKE_OPTIONS 2>gcc_errors_${suffix}.log
+  CC="$1" CFLAGS="$2 $TEST_CFLAGS" LFLAGS="$4" LDFLAGS="$5" make -j$MAKE_JOBS $3 $MAKE_OPTIONS 2>gcc_errors_${suffix}.log
   errcnt=$(wc -l < gcc_errors_${suffix}.log)
   if [[ ${errcnt} -gt 1 ]]; then
     echo " failed"
@@ -148,10 +150,10 @@ _runtest()
     # "make tune" will run "tune_it.sh" automatically, hence "autotune", but it cannot
     # get switched off without some effort, so we just let it run twice for testing purposes
     echo -e "\rRun autotune $1 $2"
-    _make "$1" "$2" ""
+    _make "$1" "$2" "" "$3" "$4"
     $_timeout $TUNE_CMD > test_${suffix}.log || _die "running autotune" $?
   else
-    _make "$1" "$2" "test"
+    _make "$1" "$2" "test" "$3" "$4"
     echo -e "\rRun test $1 $2"
     $_timeout ./test > test_${suffix}.log || _die "running tests" $?
   fi
@@ -171,13 +173,13 @@ echo "MAKE_OPTIONS = \"$MAKE_OPTIONS\""
   if [[ "$MAKE_OPTIONS" =~ "tune"  ]]
   then
 echo "autotune branch"
-    _make "$1" "$2" ""
+    _make "$1" "$2" "" "$3" "$4"
     # The shell used for /bin/sh is DASH 0.5.7-4ubuntu1 on the author's machine which fails valgrind, so
     # we just run on instance of etc/tune with the same options as in etc/tune_it.sh
     echo -e "\rRun etc/tune $1 $2 once inside valgrind"
     $_timeout $VALGRIND_BIN $VALGRIND_OPTS $TUNE_CMD > test_${suffix}.log || _die "running etc/tune" $?
   else
-    _make "$1" "$2" "test"
+    _make "$1" "$2" "test" "$3" "$4"
     echo -e "\rRun test $1 $2 inside valgrind"
     $_timeout $VALGRIND_BIN $VALGRIND_OPTS ./test > test_${suffix}.log || _die "running tests" $?
   fi
@@ -301,6 +303,11 @@ do
     --symbols)
       CHECK_SYMBOLS="1"
     ;;
+    --multithread)  # build the test-suite in its pthread-based multi-threaded mode
+      CFLAGS="$CFLAGS -DLTM_TEST_MULTITHREAD"
+      LFLAGS="$LFLAGS -pthread"    # handed to _make, which exports it to make as LFLAGS
+      LDFLAGS="$LDFLAGS -pthread"  # handed to _make, which exports it to make as LDFLAGS
+    ;;
     --all)
       COMPILERS="gcc clang"
       ARCHFLAGS="-m64 -m32 -mx32"
@@ -376,9 +383,9 @@ then
   _banner "$CC"
   if [[ "$VALGRIND_BIN" != "" ]]
   then
-    _runvalgrind "$CC" ""
+    _runvalgrind "$CC" "" "$LFLAGS"  "$LDFLAGS"
   else
-    _runtest "$CC" ""
+    _runtest "$CC" ""  "$LFLAGS"  "$LDFLAGS"
   fi
   _exit
 fi
@@ -398,9 +405,9 @@ _banner
 if [[ "$TEST_VS_MTEST" != "" ]]
 then
    make clean > /dev/null
-   _make "${compilers[0]}" "${archflags[0]} $CFLAGS" "mtest_opponent"
+   _make "${compilers[0]}" "${archflags[0]} $CFLAGS" "mtest_opponent" "$LFLAGS" "$LDFLAGS"
    echo
-   _make "gcc" "$MTEST_RAND" "mtest"
+   _make "gcc" "$MTEST_RAND" "mtest" "$LFLAGS" "$LDFLAGS"
    echo
    echo "Run test vs. mtest for $TEST_VS_MTEST iterations"
    _timeout=""
@@ -429,15 +436,15 @@ do
     fi
     if [[ "$VALGRIND_BIN" != "" ]]
     then
-      _runvalgrind "$i" "$a $CFLAGS"
+      _runvalgrind "$i" "$a $CFLAGS" "$LFLAGS" "$LDFLAGS"
       [ "$WITH_LOW_MP" != "1" ] && continue
-      _runvalgrind "$i" "$a -DMP_16BIT $CFLAGS"
-      _runvalgrind "$i" "$a -DMP_32BIT $CFLAGS"
+      _runvalgrind "$i" "$a -DMP_16BIT $CFLAGS" "$LFLAGS" "$LDFLAGS"
+      _runvalgrind "$i" "$a -DMP_32BIT $CFLAGS" "$LFLAGS" "$LDFLAGS"
     else
-      _runtest "$i" "$a $CFLAGS"
+      _runtest "$i" "$a $CFLAGS" "$LFLAGS" "$LDFLAGS"
       [ "$WITH_LOW_MP" != "1" ] && continue
-      _runtest "$i" "$a -DMP_16BIT $CFLAGS"
-      _runtest "$i" "$a -DMP_32BIT $CFLAGS"
+      _runtest "$i" "$a -DMP_16BIT $CFLAGS" "$LFLAGS" "$LDFLAGS"
+      _runtest "$i" "$a -DMP_32BIT $CFLAGS" "$LFLAGS" "$LDFLAGS"
     fi
   done
 done
diff --git a/tommath.def b/tommath.def
index 86f34872..ed5aa8b0 100644
--- a/tommath.def
+++ b/tommath.def
@@ -125,6 +125,7 @@ EXPORTS
     mp_to_ubin
     mp_ubin_size
     mp_unpack
+    mp_warray_free
     mp_xor
     mp_zero
     MP_MUL_KARATSUBA_CUTOFF
diff --git a/tommath.h b/tommath.h
index 84bb0909..1820d243 100644
--- a/tommath.h
+++ b/tommath.h
@@ -588,6 +588,8 @@ mp_err mp_fread(mp_int *a, int radix, FILE *stream) MP_WUR;
 mp_err mp_fwrite(const mp_int *a, int radix, FILE *stream) MP_WUR;
 #endif
 
+int mp_warray_free(void); /* presumably releases the work array cached for MP_SMALL_STACK_SIZE builds; NOTE(review): return codes are not documented here -- confirm in mp_warray_free.c */
+
 #define mp_to_binary(M, S, N)  mp_to_radix((M), (S), (N), NULL, 2)
 #define mp_to_octal(M, S, N)   mp_to_radix((M), (S), (N), NULL, 8)
 #define mp_to_decimal(M, S, N) mp_to_radix((M), (S), (N), NULL, 10)
diff --git a/tommath_c89.h b/tommath_c89.h
index 49400a13..22436366 100644
--- a/tommath_c89.h
+++ b/tommath_c89.h
@@ -26,6 +26,8 @@ typedef __UINT8_TYPE__  mp_u8;
 typedef __UINT16_TYPE__ mp_u16;
 typedef __UINT32_TYPE__ mp_u32;
 typedef __UINT64_TYPE__ mp_u64;
+/* pointer-sized unsigned integer, taken from the compiler so it does not depend on libc's __WORDSIZE */
+typedef __UINTPTR_TYPE__ mp_uintptr;
 
 /* inttypes.h replacement, printf format specifier */
 # if __WORDSIZE == 64
diff --git a/tommath_class.h b/tommath_class.h
index e08bc5f3..09bb3ea6 100644
--- a/tommath_class.h
+++ b/tommath_class.h
@@ -131,6 +131,7 @@
 #   define MP_TO_UBIN_C
 #   define MP_UBIN_SIZE_C
 #   define MP_UNPACK_C
+#   define MP_WARRAY_FREE_C
 #   define MP_XOR_C
 #   define MP_ZERO_C
 #   define S_MP_ADD_C
@@ -165,6 +166,9 @@
 #   define S_MP_SQR_KARATSUBA_C
 #   define S_MP_SQR_TOOM_C
 #   define S_MP_SUB_C
+#   define S_MP_WARRAY_C
+#   define S_MP_WARRAY_GET_C
+#   define S_MP_WARRAY_PUT_C
 #   define S_MP_ZERO_BUF_C
 #   define S_MP_ZERO_DIGS_C
 #endif
@@ -957,6 +961,10 @@
 #   define MP_ZERO_C
 #endif
 
+#if defined(MP_WARRAY_FREE_C)
+#   define S_MP_ZERO_BUF_C
+#endif
+
 #if defined(MP_XOR_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
@@ -1137,6 +1145,8 @@
 #   define MP_CMP_MAG_C
 #   define MP_GROW_C
 #   define S_MP_SUB_C
+#   define S_MP_WARRAY_GET_C
+#   define S_MP_WARRAY_PUT_C
 #   define S_MP_ZERO_BUF_C
 #   define S_MP_ZERO_DIGS_C
 #endif
@@ -1165,6 +1175,8 @@
 #if defined(S_MP_MUL_COMBA_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_WARRAY_GET_C
+#   define S_MP_WARRAY_PUT_C
 #   define S_MP_ZERO_DIGS_C
 #endif
 
@@ -1179,6 +1191,8 @@
 #if defined(S_MP_MUL_HIGH_COMBA_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_WARRAY_GET_C
+#   define S_MP_WARRAY_PUT_C
 #   define S_MP_ZERO_DIGS_C
 #endif
 
@@ -1244,6 +1258,8 @@
 #if defined(S_MP_SQR_COMBA_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_WARRAY_GET_C
+#   define S_MP_WARRAY_PUT_C
 #   define S_MP_ZERO_DIGS_C
 #endif
 
@@ -1279,6 +1295,15 @@
 #   define S_MP_ZERO_DIGS_C
 #endif
 
+#if defined(S_MP_WARRAY_C)
+#endif
+
+#if defined(S_MP_WARRAY_GET_C)
+#endif
+
+#if defined(S_MP_WARRAY_PUT_C)
+#endif
+
 #if defined(S_MP_ZERO_BUF_C)
 #endif
 
diff --git a/tommath_private.h b/tommath_private.h
index c1fa95a0..be620dbc 100644
--- a/tommath_private.h
+++ b/tommath_private.h
@@ -234,6 +234,47 @@ MP_PRIVATE mp_err s_mp_radix_size_overestimate(const mp_int *a, const int radix,
 MP_PRIVATE mp_err s_mp_fp_log(const mp_int *a, mp_int *c) MP_WUR;
 MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR;
 
+#ifdef MP_SMALL_STACK_SIZE
+
+#if defined(__GNUC__)
+/* We use TLS (Thread Local Storage) to keep one instance of the WARRAY
+ * per thread.
+ * The compilers we usually look at are GCC, Clang and MSVC.
+ * Both GCC and Clang are straightforward with TLS, so it's enabled there.
+ * With MSVC the tests were OK with the static library, but failed when
+ * the library was built as a DLL. As a result we completely disable
+ * support for MSVC.
+ * If your compiler can handle TLS properly without too much hocus pocus,
+ * feel free to open a PR to add support for it.
+ */
+#define mp_thread __thread
+#else
+#error "MP_SMALL_STACK_SIZE not supported with your compiler"
+#endif
+
+#define MP_SMALL_STACK_SIZE_C
+#define MP_ALLOC_WARRAY(name) *name = s_mp_warray_get() /* declarator: write it after the element type; `name` becomes a pointer */
+#define MP_FREE_WARRAY(name) s_mp_warray_put(name) /* hands the buffer to s_mp_warray_put() */
+#define MP_CHECK_WARRAY(name) do { if ((name) == NULL) { return MP_MEM; } } while(0) /* returns MP_MEM from the calling function */
+#else /* !MP_SMALL_STACK_SIZE */
+#define MP_ALLOC_WARRAY(name) name[MP_WARRAY] /* declarator: `name` becomes an array of MP_WARRAY elements */
+#define MP_FREE_WARRAY(name) /* expands to nothing */
+#define MP_CHECK_WARRAY(name) /* expands to nothing */
+#endif /* MP_SMALL_STACK_SIZE */
+
+#ifndef mp_thread
+#define mp_thread /* expands to nothing unless it was set to __thread above */
+#endif
+
+typedef struct {
+   void *w_free, *w_used; /* s_mp_warray_put() moves a buffer from w_used to w_free */
+} st_warray;
+
+extern MP_PRIVATE mp_thread st_warray s_mp_warray; /* thread-local when mp_thread is __thread */
+
+MP_PRIVATE void *s_mp_warray_get(void); /* callers NULL-check the result via MP_CHECK_WARRAY */
+MP_PRIVATE void s_mp_warray_put(void *w); /* only stores w back when w == s_mp_warray.w_used and w_free is NULL */
+
 #define MP_RADIX_MAP_REVERSE_SIZE 80u
 extern MP_PRIVATE const char s_mp_radix_map[];
 extern MP_PRIVATE const uint8_t s_mp_radix_map_reverse[];
diff --git a/tommath_superclass.h b/tommath_superclass.h
index 9245e002..10c7f12a 100644
--- a/tommath_superclass.h
+++ b/tommath_superclass.h
@@ -42,6 +42,7 @@
 #   define MP_SBIN_SIZE_C
 #   define MP_TO_RADIX_C
 #   define MP_TO_SBIN_C
+#   define MP_WARRAY_FREE_C
 #   define S_MP_RAND_JENKINS_C
 #   define S_MP_RAND_PLATFORM_C
 #endif