diff --git a/src/external/rlsw.h b/src/external/rlsw.h index ba2716790..cfb2d236f 100644 --- a/src/external/rlsw.h +++ b/src/external/rlsw.h @@ -616,69 +616,74 @@ SWAPI void swBindTexture(uint32_t id); #include // Required for: floorf(), fabsf() #if defined(__FMA__) && defined(__AVX2__) -# define SW_HAS_FMA_AVX2 -# include + #define SW_HAS_FMA_AVX2 + #include #endif #if defined(__FMA__) && defined(__AVX__) -# define SW_HAS_FMA_AVX -# include + #define SW_HAS_FMA_AVX + #include #endif #if defined(__AVX2__) -# define SW_HAS_AVX2 -# include + #define SW_HAS_AVX2 + #include #endif #if defined(__AVX__) -# define SW_HAS_AVX -# include + #define SW_HAS_AVX + #include #endif #if defined(__SSE4_2__) -# define SW_HAS_SSE42 -# include + #define SW_HAS_SSE42 + #include #endif #if defined(__SSE4_1__) -# define SW_HAS_SSE41 -# include + #define SW_HAS_SSE41 + #include #endif #if defined(__SSSE3__) -# define SW_HAS_SSSE3 -# include + #define SW_HAS_SSSE3 + #include #endif #if defined(__SSE3__) -# define SW_HAS_SSE3 -# include + #define SW_HAS_SSE3 + #include #endif #if defined(__SSE2__) -# define SW_HAS_SSE2 -# include + #define SW_HAS_SSE2 + #include #endif #if defined(__SSE__) -# define SW_HAS_SSE -# include + #define SW_HAS_SSE + #include #endif #if defined(__ARM_NEON) || defined(__aarch64__) -# if defined(__ARM_FEATURE_FMA) -# define SW_HAS_NEON_FMA -# else -# define SW_HAS_NEON -# endif -# include + #if defined(__ARM_FEATURE_FMA) + #define SW_HAS_NEON_FMA + #else + #define SW_HAS_NEON + #endif + #include +#endif + +#ifdef __riscv_vector + #define SW_HAS_RVV + #include #endif //---------------------------------------------------------------------------------- // Defines and Macros //---------------------------------------------------------------------------------- #define SW_PI 3.14159265358979323846f -#define SW_INV_255 0.00392156862745098f +#define SW_INV_255 0.00392156862745098f // 1.0f/255.0f #define SW_DEG2RAD (SW_PI/180.0f) #define SW_RAD2DEG (180.0f/SW_PI) @@ -1102,6 +1107,27 @@ static inline void sw_float_to_unorm8_simd(uint8_t dst[4], const float src[4]) clamped = _mm_packs_epi32(clamped, clamped); // s32 -> s16 (saturated) clamped = _mm_packus_epi16(clamped, clamped); // s16 -> u8 (saturated < 0 to 0) *(uint32_t*)dst = _mm_cvtsi128_si32(clamped); +#elif defined(SW_HAS_RVV) + // TODO: Sample code generated by AI, needs testing and review + size_t vl = vsetvl_e32m1(4); // Load up to 4 floats into a vector register + vfloat32m1_t vsrc = vle32_v_f32m1(src, vl); // Load float32 values + + // Clamp to [0.0f, 1.0f] + vfloat32m1_t vzero = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t vone = vfmv_v_f_f32m1(1.0f, vl); + vsrc = vfmin_vv_f32m1(vsrc, vone, vl); + vsrc = vfmax_vv_f32m1(vsrc, vzero, vl); + + // Multiply by 255.0f and add 0.5f for rounding + vfloat32m1_t vscaled = vfmul_vf_f32m1(vsrc, 255.0f, vl); + vscaled = vfadd_vf_f32m1(vscaled, 0.5f, vl); + + // Convert to unsigned integer (truncate toward zero) + vuint32m1_t vu32 = vfcvt_xu_f_v_u32m1(vscaled, vl); + + // Narrow from u32 -> u8 + vuint8m1_t vu8 = vnclipu_wx_u8m1(vu32, 0, vl); // Round toward zero + vse8_v_u8m1(dst, vu8, vl); // Store result #else for (int i = 0; i < 4; i++) { @@ -1123,18 +1149,26 @@ static inline void sw_float_from_unorm8_simd(float dst[4], const uint8_t src[4]) floats = vmulq_n_f32(floats, SW_INV_255); vst1q_f32(dst, floats); #elif defined(SW_HAS_SSE41) - __m128i bytes = _mm_cvtsi32_si128(*(const uint32_t*)src); + __m128i bytes = _mm_cvtsi32_si128(*(const uint32_t *)src); __m128i ints = _mm_cvtepu8_epi32(bytes); __m128 floats = _mm_cvtepi32_ps(ints); floats = _mm_mul_ps(floats, _mm_set1_ps(SW_INV_255)); _mm_storeu_ps(dst, floats); #elif defined(SW_HAS_SSE2) - __m128i bytes = _mm_cvtsi32_si128(*(const uint32_t*)src); + __m128i bytes = _mm_cvtsi32_si128(*(const uint32_t *)src); bytes = _mm_unpacklo_epi8(bytes, _mm_setzero_si128()); __m128i ints = _mm_unpacklo_epi16(bytes, _mm_setzero_si128()); __m128 floats = _mm_cvtepi32_ps(ints); floats = _mm_mul_ps(floats, _mm_set1_ps(SW_INV_255)); _mm_storeu_ps(dst, floats); +#elif defined(SW_HAS_RVV) + // TODO: Sample code generated by AI, needs testing and review + size_t vl = vsetvl_e8m1(4); // Set vector length for 8-bit input elements + vuint8m1_t vsrc_u8 = vle8_v_u8m1(src, vl); // Load 4 unsigned 8-bit integers + vuint32m1_t vsrc_u32 = vwcvt_xu_u_v_u32m1(vsrc_u8, vl); // Widen to 32-bit unsigned integers + vfloat32m1_t vsrc_f32 = vfcvt_f_xu_v_f32m1(vsrc_u32, vl); // Convert to float32 + vfloat32m1_t vnorm = vfmul_vf_f32m1(vsrc_f32, SW_INV_255, vl); // Multiply by 1/255.0 to normalize + vse32_v_f32m1(dst, vnorm, vl); // Store result #else dst[0] = (float)src[0]*SW_INV_255; dst[1] = (float)src[1]*SW_INV_255; @@ -2672,8 +2706,8 @@ static inline void FUNC_NAME(void) float ySubstep = 1.0f - sw_fract(v0->screen[1]); \ \ /* Calculation of vertex gradients in X and Y */ \ - float dUdx, dVdx; \ - float dUdy, dVdy; \ + float dUdx = 0.0f, dVdx = 0.0f; \ + float dUdy = 0.0f, dVdy = 0.0f; \ if (ENABLE_TEXTURE) { \ dUdx = (v1->texcoord[0] - v0->texcoord[0])*wRcp; \ dVdx = (v1->texcoord[1] - v0->texcoord[1])*wRcp; \