probonopd · soyersoyer · Jan 3, 2025 · Jan 3, 2025 · Jul 24, 2025 · Jul 28, 2025
diff --git a/src/Makefile b/src/Makefile
@@ -10,7 +10,7 @@ OBJS = main.o kernel.o minidexed.o config.o userinterface.o uimenu.o \
        mididevice.o midikeyboard.o serialmididevice.o pckeyboard.o \
        sysexfileloader.o performanceconfig.o perftimer.o \
        effect_platervbstereo.o uibuttons.o midipin.o \
-       arm_float_to_q23.o arm_scale_zip_f32.o \
+       arm_float_to_q23.o arm_scale_zip_f32.o arm_scale_zip_f32_to_q23.o \
        net/ftpdaemon.o net/ftpworker.o net/applemidi.o net/udpmidi.o net/mdnspublisher.o udpmididevice.o
 
 OPTIMIZE = -O3

diff --git a/src/arm_scale_zip_f32.c b/src/arm_scale_zip_f32.c
@@ -28,29 +28,29 @@ void arm_scale_zip_f32(
 {
     uint32_t blkCnt;                               /* Loop counter */
 
-    f32x2x2_t res;
+    f32x4x2_t res;
 
-    /* Compute 2 outputs at a time */
-    blkCnt = blockSize >> 1U;
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
 
     while (blkCnt > 0U)
     {
-        res.val[0] = vmul_n_f32(vld1_f32(pSrc1), scale);
-        res.val[1] = vmul_n_f32(vld1_f32(pSrc2), scale);
-        vst2_f32(pDst, res);
+        res.val[0] = vmulq_n_f32(vld1q_f32(pSrc1), scale);
+        res.val[1] = vmulq_n_f32(vld1q_f32(pSrc2), scale);
+        vst2q_f32(pDst, res);
 
         /* Increment pointers */
-        pSrc1 += 2;
-        pSrc2 += 2;
-        pDst += 4;
+        pSrc1 += 4;
+        pSrc2 += 4;
+        pDst += 8;
 
         /* Decrement the loop counter */
         blkCnt--;
     }
 
     /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
     ** No loop unrolling is used. */
-    blkCnt = blockSize & 1;
+    blkCnt = blockSize & 3;
 
     while (blkCnt > 0U)
     {

diff --git a/src/arm_scale_zip_f32_to_q23.c b/src/arm_scale_zip_f32_to_q23.c
@@ -0,0 +1,82 @@
+#include "arm_scale_zip_f32_to_q23.h"
+
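+/**
+* @brief Scale two floating-point vectors by a scalar, convert them to Q23 and zip (interleave) them.
+* @param[in]  pSrc1      points to input vector 1 (stored at the even output positions)
+* @param[in]  pSrc2      points to input vector 2 (stored at the odd output positions)
+* @param[in]  scale      scalar every input sample is multiplied by
+* @param[out] pDst       points to the interleaved output vector (2 * blockSize values are written)
+* @param[in]  blockSize  number of samples in each input vector
+*/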
+
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_scale_zip_f32_to_q23(
+  const float32_t * pSrc1,
+  const float32_t * pSrc2,
+        float32_t scale,
+        q23_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Number of loop iterations left */
+
+    int32x4x2_t res;                               /* val[0]: converted pSrc1 lanes, val[1]: converted pSrc2 lanes */
+
+    /* Vector loop: each pass consumes 4 samples per source and writes 8 interleaved outputs */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        res.val[0] = vcvtq_n_s32_f32(vmulq_n_f32(vld1q_f32(pSrc1), scale), 23); /* scale, convert with 23 fraction bits */
+        res.val[0] = vminq_s32(res.val[0], vdupq_n_s32(0x007fffff)); /* upper clamp: 2^23 - 1 */
+        res.val[0] = vmaxq_s32(res.val[0], vdupq_n_s32(0xff800000)); /* lower clamp: -2^23 as a 32-bit pattern */
+
+        res.val[1] = vcvtq_n_s32_f32(vmulq_n_f32(vld1q_f32(pSrc2), scale), 23); /* same steps for the second source */
+        res.val[1] = vminq_s32(res.val[1], vdupq_n_s32(0x007fffff));
+        res.val[1] = vmaxq_s32(res.val[1], vdupq_n_s32(0xff800000));
+
+        vst2q_s32(pDst, res); /* interleaving store: src1[0], src2[0], src1[1], src2[1], ... */
+
+        /* Advance: 4 samples read from each source, 8 values written to the destination */
+        pSrc1 += 4;
+        pSrc2 += 4;
+        pDst += 8;
+
+        /* One group of 4 samples done */
+        blkCnt--;
+    }
+
+    /* Scalar tail: handle the blockSize % 4 samples the vector loop did not cover.
+    ** One sample per source is processed per iteration. */
+    blkCnt = blockSize & 3;
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pSrc1++ * scale * 8388608.0f), 24); /* 8388608 = 2^23; saturate to 24 bits */
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pSrc2++ * scale * 8388608.0f), 24);
+
+        /* One remaining sample pair done */
+        blkCnt--;
+    }
+}
+#else
+void arm_scale_zip_f32_to_q23(
+  const float32_t * pSrc1,
+  const float32_t * pSrc2,
+        float32_t scale,
+        q23_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Number of sample pairs left to process */
+
+  blkCnt = blockSize;
+  /* Scalar fallback: one sample from each source per iteration, written as src1 then src2 */
+  while (blkCnt > 0U)
+  {
+      *pDst++ = (q23_t) __SSAT((q31_t) (*pSrc1++ * scale * 8388608.0f), 24); /* 8388608 = 2^23; saturate to 24 bits */
+      *pDst++ = (q23_t) __SSAT((q31_t) (*pSrc2++ * scale * 8388608.0f), 24);
+
+      /* One sample pair done */
+      blkCnt--;
+  }
+}
+#endif
diff --git a/src/arm_scale_zip_f32_to_q23.h b/src/arm_scale_zip_f32_to_q23.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "arm_math_types.h"
+
+typedef int32_t q23_t; /* 32-bit container for Q23 fixed-point values (23 fractional bits) */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+* @brief Scale two floating-point vectors by a scalar, convert them to Q23 and zip (interleave) them.
+* @param[in]  pSrc1      points to input vector 1 (stored at the even output positions)
+* @param[in]  pSrc2      points to input vector 2 (stored at the odd output positions)
+* @param[in]  scale      scalar every input sample is multiplied by
+* @param[out] pDst       points to the interleaved output vector (2 * blockSize values are written)
+* @param[in]  blockSize  number of samples in each input vector
+*/
+void arm_scale_zip_f32_to_q23(const float32_t * pSrc1, const float32_t * pSrc2, float32_t scale, q23_t * pDst, uint32_t blockSize);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/config.h b/src/config.h
@@ -49,8 +49,8 @@ class CConfig		// Configuration for MiniDexed
 	// These are max values, default is to support 8 in total with optional 16 TGs
 	static const unsigned TGsCore1 = 2;		// process 2 TGs on core 1
 	static const unsigned TGsCore23 = 3;		// process 3 TGs on core 2 and 3 each
-	static const unsigned TGsCore1Opt = 2;		// process optional additional 2 TGs on core 1
-	static const unsigned TGsCore23Opt = 3;		// process optional additional 3 TGs on core 2 and 3 each
+	static const unsigned TGsCore1Opt = 8;		// process up to 8 optional additional TGs on core 1
+	static const unsigned TGsCore23Opt = 12;	// process up to 12 optional additional TGs on core 2 and 3 each
 	static const unsigned MinToneGenerators = TGsCore1 + 2*TGsCore23;
 	static const unsigned AllToneGenerators = TGsCore1 + TGsCore1Opt + 2*TGsCore23 + 2*TGsCore23Opt;
 	static const unsigned DefToneGenerators = MinToneGenerators;

diff --git a/src/effect_platervbstereo.cpp b/src/effect_platervbstereo.cpp
@@ -158,7 +158,7 @@ AudioEffectPlateReverb::AudioEffectPlateReverb(float32_t samplerate)
 
 // #define sat16(n, rshift) signed_saturate_rshift((n), 16, (rshift))
 
-void AudioEffectPlateReverb::doReverb(const float32_t* inblockL, const float32_t* inblockR, float32_t* rvbblockL, float32_t* rvbblockR, uint16_t len)
+void AudioEffectPlateReverb::addReverb(const float32_t* inblockL, const float32_t* inblockR, float32_t* addblockL, float32_t* addblockR, uint16_t len)
 {
     float32_t input, acc, temp1, temp2;
     uint16_t temp16;
@@ -405,7 +405,7 @@ void AudioEffectPlateReverb::doReverb(const float32_t* inblockL, const float32_t
         temp1 = acc - master_lowpass_l;
         master_lowpass_l += temp1 * master_lowpass_f;
 
-	rvbblockL[i] = master_lowpass_l;
+	addblockL[i] += master_lowpass_l * reverb_level;
 
         // Channel R
         #ifdef TAP1_MODULATED
@@ -449,6 +449,6 @@ void AudioEffectPlateReverb::doReverb(const float32_t* inblockL, const float32_t
         temp1 = acc - master_lowpass_r;
         master_lowpass_r += temp1 * master_lowpass_f;
 
-	rvbblockR[i] = master_lowpass_r;
+	addblockR[i] += master_lowpass_r * reverb_level;
     }
 }
diff --git a/src/effect_platervbstereo.h b/src/effect_platervbstereo.h
@@ -60,7 +60,7 @@ class AudioEffectPlateReverb
 {
 public:
     AudioEffectPlateReverb(float32_t samplerate);
-    void doReverb(const float32_t* inblockL, const float32_t* inblockR, float32_t* rvbblockL, float32_t* rvbblockR,uint16_t len);
+    void addReverb(const float32_t* inblockL, const float32_t* inblockR, float32_t* addblockL, float32_t* addblockR, uint16_t len);
 
     void size(float n)
     {

diff --git a/src/kernel.cpp b/src/kernel.cpp
@@ -35,6 +35,7 @@ CKernel::CKernel (void)
 :	
 	CStdlibAppStdio ("minidexed"),
 	m_Config (&mFileSystem),
+	m_CPUThrottle (CPUSpeedMaximum),
 	m_GPIOManager (&mInterrupt),
  	m_I2CMaster (CMachineInfo::Get ()->GetDevice (DeviceI2CMaster), TRUE),
 	m_pSPIMaster (nullptr),