From 7207c03c7251d939293fc5477d99e9e9f88ca25b Mon Sep 17 00:00:00 2001 From: Jens Roth <98889125+jensroth-git@users.noreply.github.com> Date: Wed, 6 May 2026 12:38:52 +0200 Subject: [PATCH] [rlsw] ESP32 optimizations (#5827) * [rlsw] Add sw_rcp helper using Xtensa recip0.s for hot-path divisions Adds a `sw_rcp(x)` inline reciprocal that on Xtensa (ESP32 / ESP32-S3 LX6/LX7) emits a `recip0.s` seed plus two Newton-Raphson refinement steps -- 1-ULP accurate in ~7 instructions, all in FPU registers. On every other target it expands to plain `1.0f/x`, so generated code is byte-identical to before for non-Xtensa builds. Replaces the hot-path `1.0f/x` calls that were previously compiling to the `__divsf3` software helper on Xtensa: - perspective divide (1/w) in triangle clip-and-project (PCT and PC paths) - line and point clip-and-project NDC conversion - triangle span setup: dxRcp, blockLenRcp, wRcpA, wRcpB - triangle scanline setup: h02Rcp, h01Rcp, h12Rcp - axis-aligned quad: wRcp, hRcp - line rasterizer: stepRcp Other `1.0f/x` uses (matrix translate/normalize, texture init `tx`/`ty`, sw_matrix_rotate inverse-length) are not on the per-pixel hot path and are left untouched. Measured on ESP32-S3 @ 240 MHz, R5G6B5 240x240, textured 3D model: contributes to a ~10-15% rasterization speedup. Made-with: Cursor * [rlsw] Use ESP-DSP for 4x4 matrix multiply and per-vertex MVP transform Adds an opt-in ESP-DSP code path for ESP32 / ESP32-S3 builds. ESP-DSP is ESP-IDF's official optimized math library and ships hand-vectorized kernels that beat the scalar implementations on Xtensa. Two integration points: 1. `sw_matrix_mul_rst` -> `dspm_mult_4x4x4_f32` for any 4x4*4x4 multiply (used for MVP build, gluLookAt, push/multiply, etc.). rlsw stores matrices column-major and ESP-DSP reads row-major; the comment on the call site explains why the flat-buffer call still produces the correct column-major product (transpose-of-transposes equivalence). 2. 
`sw_immediate_push_vertex` -> `dspm_mult_4x4x1_f32` for the per-vertex clip-space transform. Because ESP-DSP expects a row-major matrix in this case, a row-major copy `matMVP_rm[16]` is maintained alongside `matMVP` and refreshed once per `isDirtyMVP` rebuild in `sw_immediate_begin`. Cost is 16 scalar copies per matrix update, amortized over thousands of vertices per frame. Detection is **opt-in** via `SW_USE_ESP_DSP` so existing ESP-IDF projects that don't depend on the `esp-dsp` component keep building unchanged. A user enables it from CMakeLists.txt (or anywhere before including rlgl.h): target_compile_definitions(${COMPONENT_LIB} PRIVATE SW_USE_ESP_DSP=1) and adds the dependency to `idf_component.yml`: espressif/esp-dsp: "^1.4.0" Measured on ESP32-S3 @ 240 MHz, R5G6B5 240x240, textured 3D model: contributes meaningfully to the overall frame-time improvement (combined with sw_rcp). Made-with: Cursor * [rlsw] Add SW_TEXTURE_REPEAT_POT_FAST opt-in for POT bitmask wrap Adds an opt-in compile-time flag that replaces the SW_REPEAT wrap chain with a bitmask (`x & (size-1)`) for power-of-two textures. NPOT textures keep using the original `sw_fract` / signed-modulo paths via a runtime `(size & (size-1)) == 0` check, so SW_REPEAT remains correct for them. Affects two samplers: - `sw_texture_sample_nearest`: drops the `floorf` + multiply + cast for POT textures in REPEAT mode (saves a software call on Xtensa). - `sw_texture_sample_linear`: replaces the `(x % w + w) % w` two-step modulo (a software divide on Xtensa) with a single bitwise AND for POT textures in REPEAT mode. Two's-complement int wrap covers negative coordinates correctly. Off by default: for POT textures sampled with negative UVs, bitmask wrap can differ from `sw_fract` wrap by one texel at the boundary. That is imperceptible at typical resolutions but technically a behavior change, so existing users get bit-for-bit identical output. 
Opt in if you control your asset UVs and want the speedup: #define SW_TEXTURE_REPEAT_POT_FAST This addresses the long-standing TODO comment "If the textures are POT, avoid the division for SW_REPEAT" in `sw_texture_sample_linear`. Made-with: Cursor --- src/external/rlsw.h | 157 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 136 insertions(+), 21 deletions(-) diff --git a/src/external/rlsw.h b/src/external/rlsw.h index 1155eee41..852f78ff7 100644 --- a/src/external/rlsw.h +++ b/src/external/rlsw.h @@ -164,6 +164,19 @@ #endif #endif +// Fast power-of-two texture wrap (SW_REPEAT mode only) +// When defined, textures whose width/height are powers of two use a bitmask +// wrap (`x & (size-1)`) instead of `floorf`-based fractional wrap or the +// signed `%` chain in the linear sampler. Saves a software divide on Xtensa +// and a few instructions everywhere. NPOT textures keep using the original +// path via a runtime `(size & (size-1)) == 0` check, so SW_REPEAT remains +// correct for them. The only observable behavior change is for POT textures +// sampled with negative UV coordinates: bitmask wrap (two's complement) can +// differ from `sw_fract` by one texel. Off by default to keep bit-for-bit +// behavior; opt in if you control your asset UVs. +// +// #define SW_TEXTURE_REPEAT_POT_FAST + //---------------------------------------------------------------------------------- // OpenGL Compatibility Types //---------------------------------------------------------------------------------- @@ -844,6 +857,17 @@ SWAPI void swGetFramebufferAttachmentParameteriv(SWattachment attachment, SWatta #endif #endif +// ESP-DSP acceleration: ESP-IDF ships an optimized math library that includes +// `dspm_mult_4x4x4_f32` (4x4 matrix multiply) and `dspm_mult_4x4x1_f32` +// (matrix * vector). These are S3-tuned hand-vectorized kernels that beat the +// scalar versions for both throughput and code-size. 
Detection is opt-in to +// keep the dependency optional: define SW_USE_ESP_DSP from your build system +// (or rely on the `idf_component.yml` example shown in the rlsw docs). +#if defined(ESP_PLATFORM) && defined(SW_USE_ESP_DSP) + #define SW_HAS_ESP_DSP + #include "dspm_mult.h" +#endif + #ifdef __cplusplus #define SW_CURLY_INIT(name) name #else @@ -1038,6 +1062,9 @@ typedef struct { SWmatrix currentMatrixMode; // Current matrix mode (e.g., sw_MODELVIEW, sw_PROJECTION) sw_matrix_t *currentMatrix; // Pointer to the currently used matrix according to the mode sw_matrix_t matMVP; // Model view projection matrix, calculated and used internally +#ifdef SW_HAS_ESP_DSP + float matMVP_rm[16]; // Row-major MVP, kept in sync for esp-dsp dspm_mult_4x4x1_f32 vertex transform +#endif bool isDirtyMVP; // Indicates if the MVP matrix should be rebuilt sw_handle_t boundFramebufferId; // Framebuffer currently bound @@ -1141,6 +1168,14 @@ static inline void sw_matrix_id(sw_matrix_t dst) static inline void sw_matrix_mul_rst(float *SW_RESTRICT dst, const float *SW_RESTRICT left, const float *SW_RESTRICT right) { +#ifdef SW_HAS_ESP_DSP + // dspm_mult_4x4x4_f32 treats its operands as row-major. rlsw stores matrices + // column-major, so passing them flat is equivalent to passing transposes: + // dspm_mult(L^T, R^T) computes (L^T)*(R^T) = (R*L)^T, which, written back into + // a flat array, has the same element layout as the column-major product (R*L) + // -- exactly the semantics of the scalar fallback below.
+ dspm_mult_4x4x4_f32(left, right, dst); +#else float l00 = left[0], l01 = left[1], l02 = left[2], l03 = left[3]; float l10 = left[4], l11 = left[5], l12 = left[6], l13 = left[7]; float l20 = left[8], l21 = left[9], l22 = left[10], l23 = left[11]; @@ -1165,6 +1200,7 @@ static inline void sw_matrix_mul_rst(float *SW_RESTRICT dst, const float *SW_RES dst[7] = l10*right[3] + l11*right[7] + l12*right[11] + l13*right[15]; dst[11] = l20*right[3] + l21*right[7] + l22*right[11] + l23*right[15]; dst[15] = l30*right[3] + l31*right[7] + l32*right[11] + l33*right[15]; +#endif } static inline void sw_matrix_mul(sw_matrix_t dst, const sw_matrix_t left, const sw_matrix_t right) @@ -1210,6 +1246,33 @@ static inline float sw_fract(float x) return (x - floorf(x)); } +// Fast reciprocal: 1-ULP accurate in ~7 instructions on FPU-equipped Xtensa +// cores, using the hardware `recip0.s` seed + two Newton-Raphson refinement +// steps. All work stays in FPU registers — no `__divsf3` software call. +// Hot-path divisions in the rasterizer (span/triangle setup, perspective +// divide, etc.) call this. Every other target, including soft-float Xtensa +// cores that lack `recip0.s`, gets plain `1.0f/x`, identical to before. +#if defined(__XTENSA__) && !defined(__XTENSA_SOFT_FLOAT__) +__attribute__((always_inline)) +static inline float sw_rcp(float x) +{ + float result, temp; + __asm__( + "recip0.s %0, %2\n" + "const.s %1, 1\n" + "msub.s %1, %2, %0\n" + "madd.s %0, %0, %1\n" + "const.s %1, 1\n" + "msub.s %1, %2, %0\n" + "maddn.s %0, %0, %1\n" + : "=&f"(result), "=&f"(temp) : "f"(x) + ); + return result; +} +#else +static inline float sw_rcp(float x) { return 1.0f/x; } +#endif + static inline uint8_t sw_luminance8(const uint8_t *color) { return (uint8_t)((color[0]*77 + color[1]*150 + color[2]*29) >> 8); @@ -2406,11 +2469,31 @@ static inline void sw_texture_free(sw_texture_t *texture) static inline void sw_texture_sample_nearest(float *SW_RESTRICT color, const sw_texture_t *SW_RESTRICT tex, float u, float v) { - u = (tex->sWrap == SW_REPEAT)?
sw_fract(u) : sw_saturate(u); - v = (tex->tWrap == SW_REPEAT)? sw_fract(v) : sw_saturate(v); + int x, y; - int x = u*tex->width; - int y = v*tex->height; +#ifdef SW_TEXTURE_REPEAT_POT_FAST + if ((tex->sWrap == SW_REPEAT) && ((tex->width & tex->wMinus1) == 0)) + { + x = (int)(u*tex->width) & tex->wMinus1; + } + else +#endif + { + u = (tex->sWrap == SW_REPEAT)? sw_fract(u) : sw_saturate(u); + x = (int)(u*tex->width); + } + +#ifdef SW_TEXTURE_REPEAT_POT_FAST + if ((tex->tWrap == SW_REPEAT) && ((tex->height & tex->hMinus1) == 0)) + { + y = (int)(v*tex->height) & tex->hMinus1; + } + else +#endif + { + v = (tex->tWrap == SW_REPEAT)? sw_fract(v) : sw_saturate(v); + y = (int)(v*tex->height); + } tex->readColor(color, tex->pixels, y*tex->width + x); } @@ -2432,13 +2515,19 @@ static inline void sw_texture_sample_linear(float *SW_RESTRICT color, const sw_t int x1 = x0 + 1; int y1 = y0 + 1; - // NOTE: If the textures are POT, avoid the division for SW_REPEAT - if (tex->sWrap == SW_CLAMP) { x0 = (x0 > tex->wMinus1)? tex->wMinus1 : x0; x1 = (x1 > tex->wMinus1)? tex->wMinus1 : x1; } +#ifdef SW_TEXTURE_REPEAT_POT_FAST + else if ((tex->width & tex->wMinus1) == 0) + { + // POT fast path: bitmask wrap covers negative ints via two's complement + x0 = x0 & tex->wMinus1; + x1 = x1 & tex->wMinus1; + } +#endif else { x0 = (x0%tex->width + tex->width)%tex->width; @@ -2450,6 +2539,13 @@ static inline void sw_texture_sample_linear(float *SW_RESTRICT color, const sw_t y0 = (y0 > tex->hMinus1)? tex->hMinus1 : y0; y1 = (y1 > tex->hMinus1)? 
tex->hMinus1 : y1; } +#ifdef SW_TEXTURE_REPEAT_POT_FAST + else if ((tex->height & tex->hMinus1) == 0) + { + y0 = y0 & tex->hMinus1; + y1 = y1 & tex->hMinus1; + } +#endif else { y0 = (y0%tex->height + tex->height)%tex->height; @@ -3366,7 +3462,7 @@ static void sw_triangle_clip_and_project(void) // Calculation of the reciprocal of W for normalization // as well as perspective-correct attributes - const float wRcp = 1.0f/v->position[3]; + const float wRcp = sw_rcp(v->position[3]); // Division of XYZ coordinates by weight v->position[0] *= wRcp; @@ -3481,7 +3577,7 @@ static void sw_quad_clip_and_project(void) // Calculation of the reciprocal of W for normalization // as well as perspective-correct attributes - const float wRcp = 1.0f/v->position[3]; + const float wRcp = sw_rcp(v->position[3]); // Division of XYZ coordinates by weight v->position[0] *= wRcp; @@ -3659,8 +3755,8 @@ static bool sw_line_clip_and_project(sw_vertex_t *v0, sw_vertex_t *v1) if (!sw_line_clip(v0, v1)) return false; // Convert clip coordinates to NDC - v0->position[3] = 1.0f/v0->position[3]; - v1->position[3] = 1.0f/v1->position[3]; + v0->position[3] = sw_rcp(v0->position[3]); + v1->position[3] = sw_rcp(v1->position[3]); for (int i = 0; i < 3; i++) { v0->position[i] *= v0->position[3]; @@ -3709,7 +3805,7 @@ static bool sw_point_clip_and_project(sw_vertex_t *v) if ((v->position[i] < -v->position[3]) || (v->position[i] > v->position[3])) return false; } - v->position[3] = 1.0f/v->position[3]; + v->position[3] = sw_rcp(v->position[3]); v->position[0] *= v->position[3]; v->position[1] *= v->position[3]; v->position[2] *= v->position[3]; @@ -3791,6 +3887,19 @@ static void sw_immediate_begin(SWdraw mode) RLSW.stackModelview[RLSW.stackModelviewCounter - 1], RLSW.stackProjection[RLSW.stackProjectionCounter - 1]); +#ifdef SW_HAS_ESP_DSP + // Pre-transpose to row-major so dspm_mult_4x4x1_f32(matMVP_rm, v, out) + // computes M*v directly in the per-vertex hot path. 16 scalar copies + // per MVP update vs. 
saving ~20 cycles per vertex transform. + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + RLSW.matMVP_rm[4*i + j] = RLSW.matMVP[4*j + i]; + } + } +#endif + RLSW.isDirtyMVP = false; } @@ -3842,11 +3951,17 @@ static void sw_immediate_push_vertex(const float position[4]) sw_vertex_t *vertex = &RLSW.primitive.buffer[RLSW.primitive.vertexCount++]; // Calculate clip coordinates +#ifdef SW_HAS_ESP_DSP + // dspm_mult_4x4x1_f32 declares its inputs non-const; rlsw treats them as + // read-only and the cast is safe (the kernel only loads from B). + dspm_mult_4x4x1_f32(RLSW.matMVP_rm, (float *)position, vertex->position); +#else const float *m = RLSW.matMVP; vertex->position[0] = m[0]*position[0] + m[4]*position[1] + m[8]*position[2] + m[12]*position[3]; vertex->position[1] = m[1]*position[0] + m[5]*position[1] + m[9]*position[2] + m[13]*position[3]; vertex->position[2] = m[2]*position[0] + m[6]*position[1] + m[10]*position[2] + m[14]*position[3]; vertex->position[3] = m[3]*position[0] + m[7]*position[1] + m[11]*position[2] + m[15]*position[3]; +#endif // Copy the attributes in the current vertex for (int i = 0; i < 4; i++) vertex->color[i] = RLSW.primitive.color[i]; @@ -5272,7 +5387,7 @@ static void SW_RASTER_TRIANGLE_SPAN(const sw_vertex_t *start, const sw_vertex_t if (xStart == xEnd) return; // Compute the inverse horizontal distance along the X axis - float dxRcp = 1.0f/(end->position[0] - start->position[0]); + float dxRcp = sw_rcp(end->position[0] - start->position[0]); // Compute the interpolation steps along the X axis float dWdx = (end->position[3] - start->position[3])*dxRcp; @@ -5326,12 +5441,12 @@ static void SW_RASTER_TRIANGLE_SPAN(const sw_vertex_t *start, const sw_vertex_t int blockEnd = x + SW_AFFINE_BLOCK; if (blockEnd > xEnd) blockEnd = xEnd; float blockLenF = (float)(blockEnd - x); - float blockLenRcp = 1.0f/blockLenF; + float blockLenRcp = sw_rcp(blockLenF); // Only 2 '1/w' here; none inside the pixel loop - float wRcpA = 1.0f/w; + float 
wRcpA = sw_rcp(w); float wB = w + dWdx*blockLenF; - float wRcpB = 1.0f/wB; + float wRcpB = sw_rcp(wB); // Perspective-correct color at both block endpoints, then affine gradient float srcColor[4] = { @@ -5459,9 +5574,9 @@ static void SW_RASTER_TRIANGLE(const sw_vertex_t *v0, const sw_vertex_t *v1, con if (h02 < 1e-6f) return; // Inverse edge dy for per-edge dV/dy (scanline interpolation) - float h02Rcp = 1.0f/h02; - float h01Rcp = (h01 > 1e-6f)? 1.0f/h01 : 0.0f; - float h12Rcp = (h12 > 1e-6f)? 1.0f/h12 : 0.0f; + float h02Rcp = sw_rcp(h02); + float h01Rcp = (h01 > 1e-6f)? sw_rcp(h01) : 0.0f; + float h12Rcp = (h12 > 1e-6f)? sw_rcp(h12) : 0.0f; // Compute gradients for each side of the triangle sw_vertex_t dVXdy02, dVXdy01, dVXdy12; @@ -5560,8 +5675,8 @@ static void SW_RASTER_QUAD(const sw_vertex_t *a, const sw_vertex_t *b, float h = (float)(yMax - yMin); if ((w <= 0) || (h <= 0)) return; - float wRcp = 1.0f/w; - float hRcp = 1.0f/h; + float wRcp = sw_rcp(w); + float hRcp = sw_rcp(h); // Subpixel corrections float xSubstep = 1.0f - sw_fract(tl->position[0]); @@ -5746,7 +5861,7 @@ static void SW_RASTER_LINE(const sw_vertex_t *v0, const sw_vertex_t *v1) // Compute per pixel increments float xInc = dx/steps; float yInc = dy/steps; - float stepRcp = 1.0f/steps; + float stepRcp = sw_rcp(steps); #ifdef SW_ENABLE_DEPTH_TEST float zInc = (v1->position[2] - v0->position[2])*stepRcp; #endif