mirror of
https://github.com/raysan5/raylib.git
synced 2026-05-08 23:02:03 +00:00
[rlsw] ESP32 optimizations (#5827)
* [rlsw] Add sw_rcp helper using Xtensa recip0.s for hot-path divisions
Adds a `sw_rcp(x)` inline reciprocal that on Xtensa (ESP32 / ESP32-S3
LX6/LX7) emits a `recip0.s` seed plus two Newton-Raphson refinement
steps -- 1-ULP accurate in ~7 instructions, all in FPU registers.
On every other target it expands to plain `1.0f/x`, so generated code
is byte-identical to before for non-Xtensa builds.
Replaces the hot-path `1.0f/x` calls that were previously compiling to
the `__divsf3` software helper on Xtensa:
- perspective divide (1/w) in triangle clip-and-project (PCT and PC paths)
- line and point clip-and-project NDC conversion
- triangle span setup: dxRcp, blockLenRcp, wRcpA, wRcpB
- triangle scanline setup: h02Rcp, h01Rcp, h12Rcp
- axis-aligned quad: wRcp, hRcp
- line rasterizer: stepRcp
Other `1.0f/x` uses (matrix translate/normalize, texture init `tx`/`ty`,
sw_matrix_rotate inverse-length) are not on the per-pixel hot path and
are left untouched.
Measured on ESP32-S3 @ 240 MHz, R5G6B5 240x240, textured 3D model:
contributes to a ~10-15% rasterization speedup.
Made-with: Cursor
* [rlsw] Use ESP-DSP for 4x4 matrix multiply and per-vertex MVP transform
Adds an opt-in ESP-DSP code path for ESP32 / ESP32-S3 builds. ESP-DSP is
ESP-IDF's official optimized math library and ships hand-vectorized
kernels that beat the scalar implementations on Xtensa.
Two integration points:
1. `sw_matrix_mul_rst` -> `dspm_mult_4x4x4_f32` for any 4x4*4x4 multiply
(used for MVP build, gluLookAt, push/multiply, etc.). rlsw stores
matrices column-major and ESP-DSP reads row-major; the comment on the
call site explains why the flat-buffer call still produces the
correct column-major product (transpose-of-transposes equivalence).
2. `sw_immediate_push_vertex` -> `dspm_mult_4x4x1_f32` for the per-vertex
clip-space transform. Because ESP-DSP expects a row-major matrix in
this case, a row-major copy `matMVP_rm[16]` is maintained alongside
`matMVP` and refreshed once per `isDirtyMVP` rebuild in
`sw_immediate_begin`. Cost is 16 scalar copies per matrix update,
amortized over thousands of vertices per frame.
Detection is **opt-in** via `SW_USE_ESP_DSP` so existing ESP-IDF projects
that don't depend on the `esp-dsp` component keep building unchanged.
A user enables it from CMakeLists.txt (or anywhere before including
rlgl.h):
target_compile_definitions(${COMPONENT_LIB} PRIVATE SW_USE_ESP_DSP=1)
and adds the dependency to `idf_component.yml`:
espressif/esp-dsp: "^1.4.0"
Measured on ESP32-S3 @ 240 MHz, R5G6B5 240x240, textured 3D model:
contributes meaningfully to the overall frame-time improvement
(combined with sw_rcp).
Made-with: Cursor
* [rlsw] Add SW_TEXTURE_REPEAT_POT_FAST opt-in for POT bitmask wrap
Adds an opt-in compile-time flag that replaces the SW_REPEAT wrap chain
with a bitmask (`x & (size-1)`) for power-of-two textures. NPOT textures
keep using the original `sw_fract` / signed-modulo paths via a runtime
`(size & (size-1)) == 0` check, so SW_REPEAT remains correct for them.
Affects two samplers:
- `sw_texture_sample_nearest`: drops the `floorf` + multiply + cast for
POT textures in REPEAT mode (saves a software call on Xtensa).
- `sw_texture_sample_linear`: replaces the `(x % w + w) % w` two-step
modulo (a software divide on Xtensa) with a single bitwise AND for
POT textures in REPEAT mode. Two's-complement int wrap covers
negative coordinates correctly.
Off by default: for POT textures sampled with negative UVs, bitmask wrap
can differ from `sw_fract` wrap by one texel at the boundary. That is
imperceptible at typical resolutions but technically a behavior change,
so existing users get bit-for-bit identical output. Opt in if you
control your asset UVs and want the speedup:
#define SW_TEXTURE_REPEAT_POT_FAST
This addresses the long-standing TODO comment "If the textures are POT,
avoid the division for SW_REPEAT" in `sw_texture_sample_linear`.
Made-with: Cursor
This commit is contained in:
157
src/external/rlsw.h
vendored
157
src/external/rlsw.h
vendored
@@ -164,6 +164,19 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Fast power-of-two texture wrap (SW_REPEAT mode only)
|
||||
// When defined, textures whose width/height are powers of two use a bitmask
|
||||
// wrap (`x & (size-1)`) instead of `floorf`-based fractional wrap or the
|
||||
// signed `%` chain in the linear sampler. Saves a software divide on Xtensa
|
||||
// and a few instructions everywhere. NPOT textures keep using the original
|
||||
// path via a runtime `(size & (size-1)) == 0` check, so SW_REPEAT remains
|
||||
// correct for them. The only observable behavior change is for POT textures
|
||||
// sampled with negative UV coordinates: bitmask wrap (two's complement) can
|
||||
// differ from `sw_fract` by one texel. Off by default to keep bit-for-bit
|
||||
// behavior; opt in if you control your asset UVs.
|
||||
//
|
||||
// #define SW_TEXTURE_REPEAT_POT_FAST
|
||||
|
||||
//----------------------------------------------------------------------------------
|
||||
// OpenGL Compatibility Types
|
||||
//----------------------------------------------------------------------------------
|
||||
@@ -844,6 +857,17 @@ SWAPI void swGetFramebufferAttachmentParameteriv(SWattachment attachment, SWatta
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// ESP-DSP acceleration: ESP-IDF ships an optimized math library that includes
|
||||
// `dspm_mult_4x4x4_f32` (4x4 matrix multiply) and `dspm_mult_4x4x1_f32`
|
||||
// (matrix * vector). These are S3-tuned hand-vectorized kernels that beat the
|
||||
// scalar versions for both throughput and code-size. Detection is opt-in to
|
||||
// keep the dependency optional: define SW_USE_ESP_DSP from your build system
|
||||
// (or rely on the `idf_component.yml` example shown in the rlsw docs).
|
||||
#if defined(ESP_PLATFORM) && defined(SW_USE_ESP_DSP)
|
||||
#define SW_HAS_ESP_DSP
|
||||
#include "dspm_mult.h"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
#define SW_CURLY_INIT(name) name
|
||||
#else
|
||||
@@ -1038,6 +1062,9 @@ typedef struct {
|
||||
SWmatrix currentMatrixMode; // Current matrix mode (e.g., sw_MODELVIEW, sw_PROJECTION)
|
||||
sw_matrix_t *currentMatrix; // Pointer to the currently used matrix according to the mode
|
||||
sw_matrix_t matMVP; // Model view projection matrix, calculated and used internally
|
||||
#ifdef SW_HAS_ESP_DSP
|
||||
float matMVP_rm[16]; // Row-major MVP, kept in sync for esp-dsp dspm_mult_4x4x1_f32 vertex transform
|
||||
#endif
|
||||
bool isDirtyMVP; // Indicates if the MVP matrix should be rebuilt
|
||||
|
||||
sw_handle_t boundFramebufferId; // Framebuffer currently bound
|
||||
@@ -1141,6 +1168,14 @@ static inline void sw_matrix_id(sw_matrix_t dst)
|
||||
|
||||
static inline void sw_matrix_mul_rst(float *SW_RESTRICT dst, const float *SW_RESTRICT left, const float *SW_RESTRICT right)
|
||||
{
|
||||
#ifdef SW_HAS_ESP_DSP
|
||||
// dspm_mult_4x4x4_f32 treats its operands as row-major. rlsw stores matrices
|
||||
// column-major, so passing them flat is equivalent to passing transposes:
|
||||
// dspm_mult(L^T, R^T) computes (L^T)*(R^T) = (R*L)^T, written back into a
|
||||
// flat array gives the same bit pattern as the column-major product (R*L)
|
||||
// -- exactly the semantic the scalar fallback below has.
|
||||
dspm_mult_4x4x4_f32(left, right, dst);
|
||||
#else
|
||||
float l00 = left[0], l01 = left[1], l02 = left[2], l03 = left[3];
|
||||
float l10 = left[4], l11 = left[5], l12 = left[6], l13 = left[7];
|
||||
float l20 = left[8], l21 = left[9], l22 = left[10], l23 = left[11];
|
||||
@@ -1165,6 +1200,7 @@ static inline void sw_matrix_mul_rst(float *SW_RESTRICT dst, const float *SW_RES
|
||||
dst[7] = l10*right[3] + l11*right[7] + l12*right[11] + l13*right[15];
|
||||
dst[11] = l20*right[3] + l21*right[7] + l22*right[11] + l23*right[15];
|
||||
dst[15] = l30*right[3] + l31*right[7] + l32*right[11] + l33*right[15];
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void sw_matrix_mul(sw_matrix_t dst, const sw_matrix_t left, const sw_matrix_t right)
|
||||
@@ -1210,6 +1246,33 @@ static inline float sw_fract(float x)
|
||||
return (x - floorf(x));
|
||||
}
|
||||
|
||||
// Fast reciprocal: 1-ULP accurate in ~7 instructions on Xtensa using the
|
||||
// hardware `recip0.s` seed + two Newton-Raphson refinement steps. All work
|
||||
// stays in FPU registers — no `__divsf3` software call. Hot-path divisions
|
||||
// in the rasterizer (span/triangle setup, perspective divide, etc.) call
|
||||
// this. On non-Xtensa targets it transparently expands to `1.0f / x`, so
|
||||
// generated code is identical to before.
|
||||
#if defined(__XTENSA__)
|
||||
__attribute__((always_inline))
|
||||
static inline float sw_rcp(float x)
|
||||
{
|
||||
float result, temp;
|
||||
__asm__(
|
||||
"recip0.s %0, %2\n"
|
||||
"const.s %1, 1\n"
|
||||
"msub.s %1, %2, %0\n"
|
||||
"madd.s %0, %0, %1\n"
|
||||
"const.s %1, 1\n"
|
||||
"msub.s %1, %2, %0\n"
|
||||
"maddn.s %0, %0, %1\n"
|
||||
: "=&f"(result), "=&f"(temp) : "f"(x)
|
||||
);
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
static inline float sw_rcp(float x) { return 1.0f/x; }
|
||||
#endif
|
||||
|
||||
static inline uint8_t sw_luminance8(const uint8_t *color)
|
||||
{
|
||||
return (uint8_t)((color[0]*77 + color[1]*150 + color[2]*29) >> 8);
|
||||
@@ -2406,11 +2469,31 @@ static inline void sw_texture_free(sw_texture_t *texture)
|
||||
|
||||
static inline void sw_texture_sample_nearest(float *SW_RESTRICT color, const sw_texture_t *SW_RESTRICT tex, float u, float v)
|
||||
{
|
||||
u = (tex->sWrap == SW_REPEAT)? sw_fract(u) : sw_saturate(u);
|
||||
v = (tex->tWrap == SW_REPEAT)? sw_fract(v) : sw_saturate(v);
|
||||
int x, y;
|
||||
|
||||
int x = u*tex->width;
|
||||
int y = v*tex->height;
|
||||
#ifdef SW_TEXTURE_REPEAT_POT_FAST
|
||||
if ((tex->sWrap == SW_REPEAT) && ((tex->width & tex->wMinus1) == 0))
|
||||
{
|
||||
x = (int)(u*tex->width) & tex->wMinus1;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
u = (tex->sWrap == SW_REPEAT)? sw_fract(u) : sw_saturate(u);
|
||||
x = (int)(u*tex->width);
|
||||
}
|
||||
|
||||
#ifdef SW_TEXTURE_REPEAT_POT_FAST
|
||||
if ((tex->tWrap == SW_REPEAT) && ((tex->height & tex->hMinus1) == 0))
|
||||
{
|
||||
y = (int)(v*tex->height) & tex->hMinus1;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
v = (tex->tWrap == SW_REPEAT)? sw_fract(v) : sw_saturate(v);
|
||||
y = (int)(v*tex->height);
|
||||
}
|
||||
|
||||
tex->readColor(color, tex->pixels, y*tex->width + x);
|
||||
}
|
||||
@@ -2432,13 +2515,19 @@ static inline void sw_texture_sample_linear(float *SW_RESTRICT color, const sw_t
|
||||
int x1 = x0 + 1;
|
||||
int y1 = y0 + 1;
|
||||
|
||||
// NOTE: If the textures are POT, avoid the division for SW_REPEAT
|
||||
|
||||
if (tex->sWrap == SW_CLAMP)
|
||||
{
|
||||
x0 = (x0 > tex->wMinus1)? tex->wMinus1 : x0;
|
||||
x1 = (x1 > tex->wMinus1)? tex->wMinus1 : x1;
|
||||
}
|
||||
#ifdef SW_TEXTURE_REPEAT_POT_FAST
|
||||
else if ((tex->width & tex->wMinus1) == 0)
|
||||
{
|
||||
// POT fast path: bitmask wrap covers negative ints via two's complement
|
||||
x0 = x0 & tex->wMinus1;
|
||||
x1 = x1 & tex->wMinus1;
|
||||
}
|
||||
#endif
|
||||
else
|
||||
{
|
||||
x0 = (x0%tex->width + tex->width)%tex->width;
|
||||
@@ -2450,6 +2539,13 @@ static inline void sw_texture_sample_linear(float *SW_RESTRICT color, const sw_t
|
||||
y0 = (y0 > tex->hMinus1)? tex->hMinus1 : y0;
|
||||
y1 = (y1 > tex->hMinus1)? tex->hMinus1 : y1;
|
||||
}
|
||||
#ifdef SW_TEXTURE_REPEAT_POT_FAST
|
||||
else if ((tex->height & tex->hMinus1) == 0)
|
||||
{
|
||||
y0 = y0 & tex->hMinus1;
|
||||
y1 = y1 & tex->hMinus1;
|
||||
}
|
||||
#endif
|
||||
else
|
||||
{
|
||||
y0 = (y0%tex->height + tex->height)%tex->height;
|
||||
@@ -3366,7 +3462,7 @@ static void sw_triangle_clip_and_project(void)
|
||||
|
||||
// Calculation of the reciprocal of W for normalization
|
||||
// as well as perspective-correct attributes
|
||||
const float wRcp = 1.0f/v->position[3];
|
||||
const float wRcp = sw_rcp(v->position[3]);
|
||||
|
||||
// Division of XYZ coordinates by weight
|
||||
v->position[0] *= wRcp;
|
||||
@@ -3481,7 +3577,7 @@ static void sw_quad_clip_and_project(void)
|
||||
|
||||
// Calculation of the reciprocal of W for normalization
|
||||
// as well as perspective-correct attributes
|
||||
const float wRcp = 1.0f/v->position[3];
|
||||
const float wRcp = sw_rcp(v->position[3]);
|
||||
|
||||
// Division of XYZ coordinates by weight
|
||||
v->position[0] *= wRcp;
|
||||
@@ -3659,8 +3755,8 @@ static bool sw_line_clip_and_project(sw_vertex_t *v0, sw_vertex_t *v1)
|
||||
if (!sw_line_clip(v0, v1)) return false;
|
||||
|
||||
// Convert clip coordinates to NDC
|
||||
v0->position[3] = 1.0f/v0->position[3];
|
||||
v1->position[3] = 1.0f/v1->position[3];
|
||||
v0->position[3] = sw_rcp(v0->position[3]);
|
||||
v1->position[3] = sw_rcp(v1->position[3]);
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
v0->position[i] *= v0->position[3];
|
||||
@@ -3709,7 +3805,7 @@ static bool sw_point_clip_and_project(sw_vertex_t *v)
|
||||
if ((v->position[i] < -v->position[3]) || (v->position[i] > v->position[3])) return false;
|
||||
}
|
||||
|
||||
v->position[3] = 1.0f/v->position[3];
|
||||
v->position[3] = sw_rcp(v->position[3]);
|
||||
v->position[0] *= v->position[3];
|
||||
v->position[1] *= v->position[3];
|
||||
v->position[2] *= v->position[3];
|
||||
@@ -3791,6 +3887,19 @@ static void sw_immediate_begin(SWdraw mode)
|
||||
RLSW.stackModelview[RLSW.stackModelviewCounter - 1],
|
||||
RLSW.stackProjection[RLSW.stackProjectionCounter - 1]);
|
||||
|
||||
#ifdef SW_HAS_ESP_DSP
|
||||
// Pre-transpose to row-major so dspm_mult_4x4x1_f32(matMVP_rm, v, out)
|
||||
// computes M*v directly in the per-vertex hot path. 16 scalar copies
|
||||
// per MVP update vs. saving ~20 cycles per vertex transform.
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
for (int j = 0; j < 4; j++)
|
||||
{
|
||||
RLSW.matMVP_rm[4*i + j] = RLSW.matMVP[4*j + i];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
RLSW.isDirtyMVP = false;
|
||||
}
|
||||
|
||||
@@ -3842,11 +3951,17 @@ static void sw_immediate_push_vertex(const float position[4])
|
||||
sw_vertex_t *vertex = &RLSW.primitive.buffer[RLSW.primitive.vertexCount++];
|
||||
|
||||
// Calculate clip coordinates
|
||||
#ifdef SW_HAS_ESP_DSP
|
||||
// dspm_mult_4x4x1_f32 declares its inputs non-const; rlsw treats them as
|
||||
// read-only and the cast is safe (the kernel only loads from B).
|
||||
dspm_mult_4x4x1_f32(RLSW.matMVP_rm, (float *)position, vertex->position);
|
||||
#else
|
||||
const float *m = RLSW.matMVP;
|
||||
vertex->position[0] = m[0]*position[0] + m[4]*position[1] + m[8]*position[2] + m[12]*position[3];
|
||||
vertex->position[1] = m[1]*position[0] + m[5]*position[1] + m[9]*position[2] + m[13]*position[3];
|
||||
vertex->position[2] = m[2]*position[0] + m[6]*position[1] + m[10]*position[2] + m[14]*position[3];
|
||||
vertex->position[3] = m[3]*position[0] + m[7]*position[1] + m[11]*position[2] + m[15]*position[3];
|
||||
#endif
|
||||
|
||||
// Copy the attributes in the current vertex
|
||||
for (int i = 0; i < 4; i++) vertex->color[i] = RLSW.primitive.color[i];
|
||||
@@ -5272,7 +5387,7 @@ static void SW_RASTER_TRIANGLE_SPAN(const sw_vertex_t *start, const sw_vertex_t
|
||||
if (xStart == xEnd) return;
|
||||
|
||||
// Compute the inverse horizontal distance along the X axis
|
||||
float dxRcp = 1.0f/(end->position[0] - start->position[0]);
|
||||
float dxRcp = sw_rcp(end->position[0] - start->position[0]);
|
||||
|
||||
// Compute the interpolation steps along the X axis
|
||||
float dWdx = (end->position[3] - start->position[3])*dxRcp;
|
||||
@@ -5326,12 +5441,12 @@ static void SW_RASTER_TRIANGLE_SPAN(const sw_vertex_t *start, const sw_vertex_t
|
||||
int blockEnd = x + SW_AFFINE_BLOCK;
|
||||
if (blockEnd > xEnd) blockEnd = xEnd;
|
||||
float blockLenF = (float)(blockEnd - x);
|
||||
float blockLenRcp = 1.0f/blockLenF;
|
||||
float blockLenRcp = sw_rcp(blockLenF);
|
||||
|
||||
// Only 2 '1/w' here; none inside the pixel loop
|
||||
float wRcpA = 1.0f/w;
|
||||
float wRcpA = sw_rcp(w);
|
||||
float wB = w + dWdx*blockLenF;
|
||||
float wRcpB = 1.0f/wB;
|
||||
float wRcpB = sw_rcp(wB);
|
||||
|
||||
// Perspective-correct color at both block endpoints, then affine gradient
|
||||
float srcColor[4] = {
|
||||
@@ -5459,9 +5574,9 @@ static void SW_RASTER_TRIANGLE(const sw_vertex_t *v0, const sw_vertex_t *v1, con
|
||||
if (h02 < 1e-6f) return;
|
||||
|
||||
// Inverse edge dy for per-edge dV/dy (scanline interpolation)
|
||||
float h02Rcp = 1.0f/h02;
|
||||
float h01Rcp = (h01 > 1e-6f)? 1.0f/h01 : 0.0f;
|
||||
float h12Rcp = (h12 > 1e-6f)? 1.0f/h12 : 0.0f;
|
||||
float h02Rcp = sw_rcp(h02);
|
||||
float h01Rcp = (h01 > 1e-6f)? sw_rcp(h01) : 0.0f;
|
||||
float h12Rcp = (h12 > 1e-6f)? sw_rcp(h12) : 0.0f;
|
||||
|
||||
// Compute gradients for each side of the triangle
|
||||
sw_vertex_t dVXdy02, dVXdy01, dVXdy12;
|
||||
@@ -5560,8 +5675,8 @@ static void SW_RASTER_QUAD(const sw_vertex_t *a, const sw_vertex_t *b,
|
||||
float h = (float)(yMax - yMin);
|
||||
if ((w <= 0) || (h <= 0)) return;
|
||||
|
||||
float wRcp = 1.0f/w;
|
||||
float hRcp = 1.0f/h;
|
||||
float wRcp = sw_rcp(w);
|
||||
float hRcp = sw_rcp(h);
|
||||
|
||||
// Subpixel corrections
|
||||
float xSubstep = 1.0f - sw_fract(tl->position[0]);
|
||||
@@ -5746,7 +5861,7 @@ static void SW_RASTER_LINE(const sw_vertex_t *v0, const sw_vertex_t *v1)
|
||||
// Compute per pixel increments
|
||||
float xInc = dx/steps;
|
||||
float yInc = dy/steps;
|
||||
float stepRcp = 1.0f/steps;
|
||||
float stepRcp = sw_rcp(steps);
|
||||
#ifdef SW_ENABLE_DEPTH_TEST
|
||||
float zInc = (v1->position[2] - v0->position[2])*stepRcp;
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user