From 7207c03c7251d939293fc5477d99e9e9f88ca25b Mon Sep 17 00:00:00 2001
From: Jens Roth <98889125+jensroth-git@users.noreply.github.com>
Date: Wed, 6 May 2026 12:38:52 +0200
Subject: [PATCH] [rlsw] ESP32 optimizations (#5827)

* [rlsw] Add sw_rcp helper using Xtensa recip0.s for hot-path divisions

Adds a `sw_rcp(x)` inline reciprocal that on Xtensa (ESP32 / ESP32-S3
LX6/LX7) emits a `recip0.s` seed plus two Newton-Raphson refinement
steps -- 1-ULP accurate in ~7 instructions, all in FPU registers.
On every other target it expands to plain `1.0f/x`, so generated code
is byte-identical to before for non-Xtensa builds.

Replaces the hot-path `1.0f/x` calls that were previously compiling to
the `__divsf3` software helper on Xtensa:

  - perspective divide (1/w) in triangle clip-and-project (PCT and PC paths)
  - line and point clip-and-project NDC conversion
  - triangle span setup: dxRcp, blockLenRcp, wRcpA, wRcpB
  - triangle scanline setup: h02Rcp, h01Rcp, h12Rcp
  - axis-aligned quad: wRcp, hRcp
  - line rasterizer: stepRcp

Other `1.0f/x` uses (matrix translate/normalize, texture init `tx`/`ty`,
sw_matrix_rotate inverse-length) are not on the per-pixel hot path and
are left untouched.

Measured on ESP32-S3 @ 240 MHz, R5G6B5 240x240, textured 3D model:
contributes to a ~10-15% rasterization speedup.

Made-with: Cursor

* [rlsw] Use ESP-DSP for 4x4 matrix multiply and per-vertex MVP transform

Adds an opt-in ESP-DSP code path for ESP32 / ESP32-S3 builds. ESP-DSP is
ESP-IDF's official optimized math library and ships hand-vectorized
kernels that beat the scalar implementations on Xtensa.

Two integration points:

  1. `sw_matrix_mul_rst` -> `dspm_mult_4x4x4_f32` for any 4x4*4x4 multiply
     (used for MVP build, gluLookAt, push/multiply, etc.). rlsw stores
     matrices column-major and ESP-DSP reads row-major; the comment on the
     call site explains why the flat-buffer call still produces the
     correct column-major product (transpose-of-transposes equivalence).

  2. `sw_immediate_push_vertex` -> `dspm_mult_4x4x1_f32` for the per-vertex
     clip-space transform. Because ESP-DSP expects a row-major matrix in
     this case, a row-major copy `matMVP_rm[16]` is maintained alongside
     `matMVP` and refreshed once per `isDirtyMVP` rebuild in
     `sw_immediate_begin`. Cost is 16 scalar copies per matrix update,
     amortized over thousands of vertices per frame.

Detection is **opt-in** via `SW_USE_ESP_DSP` so existing ESP-IDF projects
that don't depend on the `esp-dsp` component keep building unchanged.
A user enables it from CMakeLists.txt (or anywhere before including
rlgl.h):

    target_compile_definitions(${COMPONENT_LIB} PRIVATE SW_USE_ESP_DSP=1)

and adds the dependency to `idf_component.yml`:

    espressif/esp-dsp: "^1.4.0"

Measured on ESP32-S3 @ 240 MHz, R5G6B5 240x240, textured 3D model:
contributes meaningfully to the overall frame-time improvement
(combined with sw_rcp).

Made-with: Cursor

* [rlsw] Add SW_TEXTURE_REPEAT_POT_FAST opt-in for POT bitmask wrap

Adds an opt-in compile-time flag that replaces the SW_REPEAT wrap chain
with a bitmask (`x & (size-1)`) for power-of-two textures. NPOT textures
keep using the original `sw_fract` / signed-modulo paths via a runtime
`(size & (size-1)) == 0` check, so SW_REPEAT remains correct for them.

Affects two samplers:

  - `sw_texture_sample_nearest`: drops the `floorf` + multiply + cast for
    POT textures in REPEAT mode (saves a software call on Xtensa).
  - `sw_texture_sample_linear`: replaces the `(x % w + w) % w` two-step
    modulo (a software divide on Xtensa) with a single bitwise AND for
    POT textures in REPEAT mode. Two's-complement int wrap covers
    negative coordinates correctly.

Off by default: for POT textures sampled with negative UVs, bitmask wrap
can differ from `sw_fract` wrap by one texel at the boundary. That is
imperceptible at typical resolutions but technically a behavior change,
so existing users get bit-for-bit identical output. Opt in if you
control your asset UVs and want the speedup:

    #define SW_TEXTURE_REPEAT_POT_FAST

This addresses the long-standing TODO comment "If the textures are POT,
avoid the division for SW_REPEAT" in `sw_texture_sample_linear`.

Made-with: Cursor
---
 src/external/rlsw.h | 157 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 136 insertions(+), 21 deletions(-)

diff --git a/src/external/rlsw.h b/src/external/rlsw.h
index 1155eee41..852f78ff7 100644
--- a/src/external/rlsw.h
+++ b/src/external/rlsw.h
@@ -164,6 +164,19 @@
     #endif
 #endif
 
+// Fast power-of-two texture wrap (SW_REPEAT mode only)
+// When defined, textures whose width/height are powers of two use a bitmask
+// wrap (`x & (size-1)`) instead of `floorf`-based fractional wrap or the
+// signed `%` chain in the linear sampler. Saves a software divide on Xtensa
+// and a few instructions everywhere. NPOT textures keep using the original
+// path via a runtime `(size & (size-1)) == 0` check, so SW_REPEAT remains
+// correct for them. The only observable behavior change is for POT textures
+// sampled with negative UV coordinates: bitmask wrap (two's complement) can
+// differ from `sw_fract` by one texel. Off by default to keep bit-for-bit
+// behavior; opt in if you control your asset UVs.
+//
+// #define SW_TEXTURE_REPEAT_POT_FAST
+
 //----------------------------------------------------------------------------------
 // OpenGL Compatibility Types
 //----------------------------------------------------------------------------------
@@ -844,6 +857,17 @@ SWAPI void swGetFramebufferAttachmentParameteriv(SWattachment attachment, SWatta
     #endif
 #endif
 
+// ESP-DSP acceleration: ESP-IDF ships an optimized math library that includes
+// `dspm_mult_4x4x4_f32` (4x4 matrix multiply) and `dspm_mult_4x4x1_f32`
+// (matrix * vector). These are S3-tuned hand-vectorized kernels that beat the
+// scalar versions for both throughput and code-size. Detection is opt-in to
+// keep the dependency optional: define SW_USE_ESP_DSP from your build system
+// (or rely on the `idf_component.yml` example shown in the rlsw docs).
+#if defined(ESP_PLATFORM) && defined(SW_USE_ESP_DSP)
+    #define SW_HAS_ESP_DSP
+    #include "dspm_mult.h"
+#endif
+
 #ifdef __cplusplus
     #define SW_CURLY_INIT(name) name
 #else
@@ -1038,6 +1062,9 @@ typedef struct {
     SWmatrix currentMatrixMode;                                 // Current matrix mode (e.g., sw_MODELVIEW, sw_PROJECTION)
     sw_matrix_t *currentMatrix;                                 // Pointer to the currently used matrix according to the mode
     sw_matrix_t matMVP;                                         // Model view projection matrix, calculated and used internally
+#ifdef SW_HAS_ESP_DSP
+    float matMVP_rm[16];                                        // Row-major MVP, kept in sync for esp-dsp dspm_mult_4x4x1_f32 vertex transform
+#endif
     bool isDirtyMVP;                                            // Indicates if the MVP matrix should be rebuilt
 
     sw_handle_t boundFramebufferId;                             // Framebuffer currently bound
@@ -1141,6 +1168,14 @@ static inline void sw_matrix_id(sw_matrix_t dst)
 
 static inline void sw_matrix_mul_rst(float *SW_RESTRICT dst, const float *SW_RESTRICT left, const float *SW_RESTRICT right)
 {
+#ifdef SW_HAS_ESP_DSP
+    // dspm_mult_4x4x4_f32 treats its operands as row-major. rlsw stores matrices
+    // column-major, so passing them flat is equivalent to passing transposes:
+    // dspm_mult(L^T, R^T) computes (L^T)*(R^T) = (R*L)^T, written back into a
+    // flat array gives the same bit pattern as the column-major product (R*L)
+    // -- exactly the semantic the scalar fallback below has.
+    dspm_mult_4x4x4_f32(left, right, dst);
+#else
     float l00 = left[0],  l01 = left[1],  l02 = left[2],  l03 = left[3];
     float l10 = left[4],  l11 = left[5],  l12 = left[6],  l13 = left[7];
     float l20 = left[8],  l21 = left[9],  l22 = left[10], l23 = left[11];
@@ -1165,6 +1200,7 @@ static inline void sw_matrix_mul_rst(float *SW_RESTRICT dst, const float *SW_RES
     dst[7]  = l10*right[3] + l11*right[7] + l12*right[11] + l13*right[15];
     dst[11] = l20*right[3] + l21*right[7] + l22*right[11] + l23*right[15];
     dst[15] = l30*right[3] + l31*right[7] + l32*right[11] + l33*right[15];
+#endif
 }
 
 static inline void sw_matrix_mul(sw_matrix_t dst, const sw_matrix_t left, const sw_matrix_t right)
@@ -1210,6 +1246,33 @@ static inline float sw_fract(float x)
     return (x - floorf(x));
 }
 
+// Fast reciprocal: 1-ULP accurate in ~7 instructions on Xtensa using the
+// hardware `recip0.s` seed + two Newton-Raphson refinement steps. All work
+// stays in FPU registers — no `__divsf3` software call. Hot-path divisions
+// in the rasterizer (span/triangle setup, perspective divide, etc.) call
+// this. On non-Xtensa targets it transparently expands to `1.0f / x`, so
+// generated code is identical to before.
+#if defined(__XTENSA__)
+__attribute__((always_inline))
+static inline float sw_rcp(float x)
+{
+    float result, temp;
+    __asm__(
+        "recip0.s %0, %2\n"
+        "const.s  %1, 1\n"
+        "msub.s   %1, %2, %0\n"
+        "madd.s   %0, %0, %1\n"
+        "const.s  %1, 1\n"
+        "msub.s   %1, %2, %0\n"
+        "maddn.s  %0, %0, %1\n"
+        : "=&f"(result), "=&f"(temp) : "f"(x)
+    );
+    return result;
+}
+#else
+static inline float sw_rcp(float x) { return 1.0f/x; }
+#endif
+
 static inline uint8_t sw_luminance8(const uint8_t *color)
 {
     return (uint8_t)((color[0]*77 + color[1]*150 + color[2]*29) >> 8);
@@ -2406,11 +2469,31 @@ static inline void sw_texture_free(sw_texture_t *texture)
 
 static inline void sw_texture_sample_nearest(float *SW_RESTRICT color, const sw_texture_t *SW_RESTRICT tex, float u, float v)
 {
-    u = (tex->sWrap == SW_REPEAT)? sw_fract(u) : sw_saturate(u);
-    v = (tex->tWrap == SW_REPEAT)? sw_fract(v) : sw_saturate(v);
+    int x, y;
 
-    int x = u*tex->width;
-    int y = v*tex->height;
+#ifdef SW_TEXTURE_REPEAT_POT_FAST
+    if ((tex->sWrap == SW_REPEAT) && ((tex->width & tex->wMinus1) == 0))
+    {
+        x = (int)(u*tex->width) & tex->wMinus1;
+    }
+    else
+#endif
+    {
+        u = (tex->sWrap == SW_REPEAT)? sw_fract(u) : sw_saturate(u);
+        x = (int)(u*tex->width);
+    }
+
+#ifdef SW_TEXTURE_REPEAT_POT_FAST
+    if ((tex->tWrap == SW_REPEAT) && ((tex->height & tex->hMinus1) == 0))
+    {
+        y = (int)(v*tex->height) & tex->hMinus1;
+    }
+    else
+#endif
+    {
+        v = (tex->tWrap == SW_REPEAT)? sw_fract(v) : sw_saturate(v);
+        y = (int)(v*tex->height);
+    }
 
     tex->readColor(color, tex->pixels, y*tex->width + x);
 }
@@ -2432,13 +2515,19 @@ static inline void sw_texture_sample_linear(float *SW_RESTRICT color, const sw_t
     int x1 = x0 + 1;
     int y1 = y0 + 1;
 
-    // NOTE: If the textures are POT, avoid the division for SW_REPEAT
-
     if (tex->sWrap == SW_CLAMP)
     {
         x0 = (x0 > tex->wMinus1)? tex->wMinus1 : x0;
         x1 = (x1 > tex->wMinus1)? tex->wMinus1 : x1;
     }
+#ifdef SW_TEXTURE_REPEAT_POT_FAST
+    else if ((tex->width & tex->wMinus1) == 0)
+    {
+        // POT fast path: bitmask wrap covers negative ints via two's complement
+        x0 = x0 & tex->wMinus1;
+        x1 = x1 & tex->wMinus1;
+    }
+#endif
     else
     {
         x0 = (x0%tex->width + tex->width)%tex->width;
@@ -2450,6 +2539,13 @@ static inline void sw_texture_sample_linear(float *SW_RESTRICT color, const sw_t
         y0 = (y0 > tex->hMinus1)? tex->hMinus1 : y0;
         y1 = (y1 > tex->hMinus1)? tex->hMinus1 : y1;
     }
+#ifdef SW_TEXTURE_REPEAT_POT_FAST
+    else if ((tex->height & tex->hMinus1) == 0)
+    {
+        y0 = y0 & tex->hMinus1;
+        y1 = y1 & tex->hMinus1;
+    }
+#endif
     else
     {
         y0 = (y0%tex->height + tex->height)%tex->height;
@@ -3366,7 +3462,7 @@ static void sw_triangle_clip_and_project(void)
 
             // Calculation of the reciprocal of W for normalization
             // as well as perspective-correct attributes
-            const float wRcp = 1.0f/v->position[3];
+            const float wRcp = sw_rcp(v->position[3]);
 
             // Division of XYZ coordinates by weight
             v->position[0] *= wRcp;
@@ -3481,7 +3577,7 @@ static void sw_quad_clip_and_project(void)
 
             // Calculation of the reciprocal of W for normalization
             // as well as perspective-correct attributes
-            const float wRcp = 1.0f/v->position[3];
+            const float wRcp = sw_rcp(v->position[3]);
 
             // Division of XYZ coordinates by weight
             v->position[0] *= wRcp;
@@ -3659,8 +3755,8 @@ static bool sw_line_clip_and_project(sw_vertex_t *v0, sw_vertex_t *v1)
     if (!sw_line_clip(v0, v1)) return false;
 
     // Convert clip coordinates to NDC
-    v0->position[3] = 1.0f/v0->position[3];
-    v1->position[3] = 1.0f/v1->position[3];
+    v0->position[3] = sw_rcp(v0->position[3]);
+    v1->position[3] = sw_rcp(v1->position[3]);
     for (int i = 0; i < 3; i++)
     {
         v0->position[i] *= v0->position[3];
@@ -3709,7 +3805,7 @@ static bool sw_point_clip_and_project(sw_vertex_t *v)
             if ((v->position[i] < -v->position[3]) || (v->position[i] > v->position[3])) return false;
         }
 
-        v->position[3] = 1.0f/v->position[3];
+        v->position[3] = sw_rcp(v->position[3]);
         v->position[0] *= v->position[3];
         v->position[1] *= v->position[3];
         v->position[2] *= v->position[3];
@@ -3791,6 +3887,19 @@ static void sw_immediate_begin(SWdraw mode)
             RLSW.stackModelview[RLSW.stackModelviewCounter - 1],
             RLSW.stackProjection[RLSW.stackProjectionCounter - 1]);
 
+#ifdef SW_HAS_ESP_DSP
+        // Pre-transpose to row-major so dspm_mult_4x4x1_f32(matMVP_rm, v, out)
+        // computes M*v directly in the per-vertex hot path. 16 scalar copies
+        // per MVP update vs. saving ~20 cycles per vertex transform.
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                RLSW.matMVP_rm[4*i + j] = RLSW.matMVP[4*j + i];
+            }
+        }
+#endif
+
         RLSW.isDirtyMVP = false;
     }
 
@@ -3842,11 +3951,17 @@ static void sw_immediate_push_vertex(const float position[4])
     sw_vertex_t *vertex = &RLSW.primitive.buffer[RLSW.primitive.vertexCount++];
 
     // Calculate clip coordinates
+#ifdef SW_HAS_ESP_DSP
+    // dspm_mult_4x4x1_f32 declares its inputs non-const; rlsw treats them as
+    // read-only and the cast is safe (the kernel only loads from B).
+    dspm_mult_4x4x1_f32(RLSW.matMVP_rm, (float *)position, vertex->position);
+#else
     const float *m = RLSW.matMVP;
     vertex->position[0] = m[0]*position[0] + m[4]*position[1] + m[8]*position[2] + m[12]*position[3];
     vertex->position[1] = m[1]*position[0] + m[5]*position[1] + m[9]*position[2] + m[13]*position[3];
     vertex->position[2] = m[2]*position[0] + m[6]*position[1] + m[10]*position[2] + m[14]*position[3];
     vertex->position[3] = m[3]*position[0] + m[7]*position[1] + m[11]*position[2] + m[15]*position[3];
+#endif
 
     // Copy the attributes in the current vertex
     for (int i = 0; i < 4; i++) vertex->color[i] = RLSW.primitive.color[i];
@@ -5272,7 +5387,7 @@ static void SW_RASTER_TRIANGLE_SPAN(const sw_vertex_t *start, const sw_vertex_t
     if (xStart == xEnd) return;
 
     // Compute the inverse horizontal distance along the X axis
-    float dxRcp = 1.0f/(end->position[0] - start->position[0]);
+    float dxRcp = sw_rcp(end->position[0] - start->position[0]);
 
     // Compute the interpolation steps along the X axis
     float dWdx = (end->position[3] - start->position[3])*dxRcp;
@@ -5326,12 +5441,12 @@ static void SW_RASTER_TRIANGLE_SPAN(const sw_vertex_t *start, const sw_vertex_t
         int blockEnd = x + SW_AFFINE_BLOCK;
         if (blockEnd > xEnd) blockEnd = xEnd;
         float blockLenF = (float)(blockEnd - x);
-        float blockLenRcp = 1.0f/blockLenF;
+        float blockLenRcp = sw_rcp(blockLenF);
 
         // Only 2 '1/w' here; none inside the pixel loop
-        float wRcpA = 1.0f/w;
+        float wRcpA = sw_rcp(w);
         float wB = w + dWdx*blockLenF;
-        float wRcpB = 1.0f/wB;
+        float wRcpB = sw_rcp(wB);
 
         // Perspective-correct color at both block endpoints, then affine gradient
         float srcColor[4] = {
@@ -5459,9 +5574,9 @@ static void SW_RASTER_TRIANGLE(const sw_vertex_t *v0, const sw_vertex_t *v1, con
     if (h02 < 1e-6f) return;
 
     // Inverse edge dy for per-edge dV/dy (scanline interpolation)
-    float h02Rcp = 1.0f/h02;
-    float h01Rcp = (h01 > 1e-6f)? 1.0f/h01 : 0.0f;
-    float h12Rcp = (h12 > 1e-6f)? 1.0f/h12 : 0.0f;
+    float h02Rcp = sw_rcp(h02);
+    float h01Rcp = (h01 > 1e-6f)? sw_rcp(h01) : 0.0f;
+    float h12Rcp = (h12 > 1e-6f)? sw_rcp(h12) : 0.0f;
 
     // Compute gradients for each side of the triangle
     sw_vertex_t dVXdy02, dVXdy01, dVXdy12;
@@ -5560,8 +5675,8 @@ static void SW_RASTER_QUAD(const sw_vertex_t *a, const sw_vertex_t *b,
     float h = (float)(yMax - yMin);
     if ((w <= 0) || (h <= 0)) return;
 
-    float wRcp = 1.0f/w;
-    float hRcp = 1.0f/h;
+    float wRcp = sw_rcp(w);
+    float hRcp = sw_rcp(h);
 
     // Subpixel corrections
     float xSubstep = 1.0f - sw_fract(tl->position[0]);
@@ -5746,7 +5861,7 @@ static void SW_RASTER_LINE(const sw_vertex_t *v0, const sw_vertex_t *v1)
     // Compute per pixel increments
     float xInc = dx/steps;
     float yInc = dy/steps;
-    float stepRcp = 1.0f/steps;
+    float stepRcp = sw_rcp(steps);
 #ifdef SW_ENABLE_DEPTH_TEST
     float zInc = (v1->position[2] - v0->position[2])*stepRcp;
 #endif