From e9231bc4f1b3df2c1eec24cb535130d2e6aabd6a Mon Sep 17 00:00:00 2001 From: Ray Date: Sun, 29 Mar 2026 21:24:20 +0200 Subject: [PATCH] Update stb_image_resize2.h --- src/external/stb_image_resize2.h | 662 +++++++++++++++++-------------- 1 file changed, 370 insertions(+), 292 deletions(-) diff --git a/src/external/stb_image_resize2.h b/src/external/stb_image_resize2.h index 2f2627463..079897658 100644 --- a/src/external/stb_image_resize2.h +++ b/src/external/stb_image_resize2.h @@ -1,4 +1,4 @@ -/* stb_image_resize2 - v2.12 - public domain image resizing +/* stb_image_resize2 - v2.18 - public domain image resizing by Jeff Roberts (v2) and Jorge L Rodriguez http://github.com/nothings/stb @@ -141,13 +141,13 @@ COLOR+ALPHA buffer types tell the resizer to do. When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB, - STBIR_ABGR, STBIR_RX, or STBIR_XR you are telling us that the pixels are + STBIR_ABGR, STBIR_RA, or STBIR_AR you are telling us that the pixels are non-premultiplied. In these cases, the resizer will alpha weight the colors (effectively creating the premultiplied image), do the filtering, and then convert back to non-premult on exit. - When you use the pixel layouts STBIR_RGBA_PM, STBIR_RGBA_PM, STBIR_RGBA_PM, - STBIR_RGBA_PM, STBIR_RX_PM or STBIR_XR_PM, you are telling that the pixels + When you use the pixel layouts STBIR_RGBA_PM, STBIR_BGRA_PM, STBIR_ARGB_PM, + STBIR_ABGR_PM, STBIR_RA_PM or STBIR_AR_PM, you are telling that the pixels ARE premultiplied. In this case, the resizer doesn't have to do the premultipling - it can filter directly on the input. This about twice as fast as the non-premultiplied case, so it's the right option if your data is @@ -254,7 +254,7 @@ using the stbir_set_filter_callbacks function. PROGRESS - For interactive use with slow resize operations, you can use the the + For interactive use with slow resize operations, you can use the scanline callbacks in the extended API. 
It would have to be a *very* large image resample to need progress though - we're very fast. @@ -307,6 +307,8 @@ some pixel reconversion, but probably dwarfed by things falling out of cache. Probably also something possible with alternating between scattering and gathering at high resize scales? + * Should we have a multiple MIPs at the same time function (could keep + more memory in cache during multiple resizes)? * Rewrite the coefficient generator to do many at once. * AVX-512 vertical kernels - worried about downclocking here. * Convert the reincludes to macros when we know they aren't changing. @@ -327,6 +329,24 @@ Nathan Reed: warning fixes for 1.0 REVISIONS + 2.18 (2026-03-25) fixed coefficient calculation when skipping a coefficient off + the left side of the window, added non-aligned access safe + memcpy mode for scalar path, fixed various typos, and fixed + define error in the float clamp output mode. + 2.17 (2025-10-25) silly format bug in easy-to-use APIs. + 2.16 (2025-10-21) fixed the easy-to-use APIs to allow inverted bitmaps (negative + strides), fix vertical filter kernel callback, fix threaded + gather buffer priming (and assert). + (thanks adipose, TainZerL, and Harrison Green) + 2.15 (2025-07-17) fixed an assert in debug mode when using floats with input + callbacks, work around GCC warning when adding to null ptr + (thanks Johannes Spohr and Pyry Kovanen). + 2.14 (2025-05-09) fixed a bug using downsampling gather horizontal first, and + scatter with vertical first. + 2.13 (2025-02-27) fixed a bug when using input callbacks, turned off simd for + tiny-c, fixed some variables that should have been static, + fixes a bug when calculating temp memory with resizes that + exceed 2GB of temp memory (very large resizes). 
2.12 (2024-10-18) fix incorrect use of user_data with STBIR_FREE 2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode with AVX-2, fix some weird scaling edge conditions with @@ -382,62 +402,6 @@ typedef uint32_t stbir_uint32; typedef uint64_t stbir_uint64; #endif -#ifdef _M_IX86_FP -#if ( _M_IX86_FP >= 1 ) -#ifndef STBIR_SSE -#define STBIR_SSE -#endif -#endif -#endif - -#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2) - #ifndef STBIR_SSE2 - #define STBIR_SSE2 - #endif - #if defined(__AVX__) || defined(STBIR_AVX2) - #ifndef STBIR_AVX - #ifndef STBIR_NO_AVX - #define STBIR_AVX - #endif - #endif - #endif - #if defined(__AVX2__) || defined(STBIR_AVX2) - #ifndef STBIR_NO_AVX2 - #ifndef STBIR_AVX2 - #define STBIR_AVX2 - #endif - #if defined( _MSC_VER ) && !defined(__clang__) - #ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -m16c - #define STBIR_FP16C - #endif - #endif - #endif - #endif - #ifdef __F16C__ - #ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for clang and gcc) - #define STBIR_FP16C - #endif - #endif -#endif - -#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__) -#ifndef STBIR_NEON -#define STBIR_NEON -#endif -#endif - -#if defined(_M_ARM) || defined(__arm__) -#ifdef STBIR_USE_FMA -#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC -#endif -#endif - -#if defined(__wasm__) && defined(__wasm_simd128__) -#ifndef STBIR_WASM -#define STBIR_WASM -#endif -#endif - #ifndef STBIRDEF #ifdef STB_IMAGE_RESIZE_STATIC #define STBIRDEF static @@ -1036,7 +1000,7 @@ typedef struct char no_cache_straddle[64]; } stbir__per_split_info; -typedef void stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input ); +typedef float * 
stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input ); typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels ); typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ); @@ -1099,8 +1063,8 @@ struct stbir__info #define stbir__max_uint8_as_float 255.0f #define stbir__max_uint16_as_float 65535.0f -#define stbir__max_uint8_as_float_inverted (1.0f/255.0f) -#define stbir__max_uint16_as_float_inverted (1.0f/65535.0f) +#define stbir__max_uint8_as_float_inverted 3.9215689e-03f // (1.0f/255.0f) +#define stbir__max_uint16_as_float_inverted 1.5259022e-05f // (1.0f/65535.0f) #define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20)) // min/max friendly @@ -1205,6 +1169,69 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split? 
#endif +#define STBIR_INPUT_CALLBACK_PADDING 3 + +#ifdef _M_IX86_FP +#if ( _M_IX86_FP >= 1 ) +#ifndef STBIR_SSE +#define STBIR_SSE +#endif +#endif +#endif + +#ifdef __TINYC__ + // tiny c has no intrinsics yet - this can become a version check if they add them + #define STBIR_NO_SIMD +#endif + +#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2) + #ifndef STBIR_SSE2 + #define STBIR_SSE2 + #endif + #if defined(__AVX__) || defined(STBIR_AVX2) + #ifndef STBIR_AVX + #ifndef STBIR_NO_AVX + #define STBIR_AVX + #endif + #endif + #endif + #if defined(__AVX2__) || defined(STBIR_AVX2) + #ifndef STBIR_NO_AVX2 + #ifndef STBIR_AVX2 + #define STBIR_AVX2 + #endif + #if defined( _MSC_VER ) && !defined(__clang__) + #ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -mf16c + #define STBIR_FP16C + #endif + #endif + #endif + #endif + #ifdef __F16C__ + #ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for clang and gcc) + #define STBIR_FP16C + #endif + #endif +#endif + +#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__) +#ifndef STBIR_NEON +#define STBIR_NEON +#endif +#endif + +#if defined(_M_ARM) || defined(__arm__) +#ifdef STBIR_USE_FMA +#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC +#endif +#endif + +#if defined(__wasm__) && defined(__wasm_simd128__) +#ifndef STBIR_WASM +#define STBIR_WASM +#endif +#endif + // restrict pointers for the output pointers, other loop and unroll control #if defined( _MSC_VER ) && !defined(__clang__) #define STBIR_STREAMOUT_PTR( star ) star __restrict @@ -1451,8 +1478,8 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #include <smmintrin.h> #define stbir__simdf_pack_to_8words(out,reg0,reg1) out = 
_mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))) #else - STBIR__SIMDI_CONST(stbir__s32_32768, 32768); - STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768)); + static STBIR__SIMDI_CONST(stbir__s32_32768, 32768); + static STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768)); #define stbir__simdf_pack_to_8words(out,reg0,reg1) \ { \ @@ -2545,7 +2572,7 @@ static const STBIR__SIMDF_CONST(STBIR_simd_point5, 0.5f); static const STBIR__SIMDF_CONST(STBIR_ones, 1.0f); static const STBIR__SIMDI_CONST(STBIR_almost_zero, (127 - 13) << 23); static const STBIR__SIMDI_CONST(STBIR_almost_one, 0x3f7fffff); -static const STBIR__SIMDI_CONST(STBIR_mastissa_mask, 0xff); +static const STBIR__SIMDI_CONST(STBIR_mantissa_mask, 0xff); static const STBIR__SIMDI_CONST(STBIR_topscale, 0x02000000); // Basically, in simd mode, we unroll the proper amount, and we don't want @@ -2816,16 +2843,34 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes; ptrdiff_t ofs_to_dest = (char*)dest - (char*)src; - if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away? + if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away { char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7); - STBIR_NO_UNROLL_LOOP_START - do + + if ( ( ( ((ptrdiff_t)dest)|((ptrdiff_t)src) ) & 7 ) == 0 ) // is it 8byte aligned? 
{ - STBIR_NO_UNROLL(sd); - *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd; - sd += 8; - } while ( sd < s_end8 ); + STBIR_NO_UNROLL_LOOP_START + do + { + STBIR_NO_UNROLL(sd); + *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd; + sd += 8; + } while ( sd < s_end8 ); + } + else + { + STBIR_NO_UNROLL_LOOP_START + do + { + int a,b; + STBIR_NO_UNROLL(sd); + a = ((int*)sd)[0]; + b = ((int*)sd)[1]; + ((int*)( sd + ofs_to_dest ))[0] = a; + ((int*)( sd + ofs_to_dest ))[1] = b; + sd += 8; + } while ( sd < s_end8 ); + } if ( sd == s_end ) return; @@ -3217,10 +3262,9 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline newspan->n0 = -left_margin; newspan->n1 = ( max_left - min_left ) - left_margin; scanline_extents->edge_sizes[0] = 0; // don't need to copy the left margin, since we are directly decoding into the margin - return; } - - // if we can't merge the min_left range, add it as a second range + // if we can't merge the min_right range, add it as a second range + else if ( ( right_margin ) && ( min_right != 0x7fffffff ) ) { stbir__span * newspan = scanline_extents->spans + 1; @@ -3235,7 +3279,14 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline newspan->n0 = scanline_extents->spans[1].n1 + 1; newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right ); scanline_extents->edge_sizes[1] = 0; // don't need to copy the right margin, since we are directly decoding into the margin - return; + } + + // sort the spans into write output order + if ( ( scanline_extents->spans[1].n1 > scanline_extents->spans[1].n0 ) && ( scanline_extents->spans[0].n0 > scanline_extents->spans[1].n0 ) ) + { + stbir__span tspan = scanline_extents->spans[0]; + scanline_extents->spans[0] = scanline_extents->spans[1]; + scanline_extents->spans[1] = tspan; } } @@ -3328,23 +3379,29 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_ static void stbir__insert_coeff( 
stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width ) { - if ( new_pixel <= contribs->n1 ) // before the end + if ( contribs->n1 < contribs->n0 ) // this first clause should never happen, but handle in case + { + contribs->n0 = contribs->n1 = new_pixel; + coeffs[0] = new_coeff; + } + else if ( new_pixel <= contribs->n1 ) // before the end { if ( new_pixel < contribs->n0 ) // before the front? { if ( ( contribs->n1 - new_pixel + 1 ) <= max_width ) { int j, o = contribs->n0 - new_pixel; - for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- ) + for ( j = contribs->n1 - contribs->n0 ; j >= 0 ; j-- ) coeffs[ j + o ] = coeffs[ j ]; - for ( j = 1 ; j < o ; j-- ) - coeffs[ j ] = coeffs[ 0 ]; + for ( j = 1 ; j < o ; j++ ) + coeffs[ j ] = 0; coeffs[ 0 ] = new_coeff; contribs->n0 = new_pixel; } } else { + // add new weight to existing coeff if already there coeffs[ new_pixel - contribs->n0 ] += new_coeff; } } @@ -3791,7 +3848,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } } - // some horizontal routines read one float off the end (which is then masked off), so put in a sentinal so we don't read an snan or denormal + // some horizontal routines read one float off the end (which is then masked off), so put in a sentinel so we don't read an snan or denormal coefficents[ widest * num_contributors ] = 8888.0f; // the minimum we might read for unrolled filters widths is 12. 
So, we need to @@ -4560,7 +4617,8 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size); const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes; stbir__span const * spans = stbir_info->scanline_extents.spans; - float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels; + float * full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels; + float * last_decoded = 0; // if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) ); @@ -4588,12 +4646,12 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float if ( stbir_info->in_pixels_cb ) { // call the callback with a temp buffer (that they can choose to use or not). the temp is just right aligned memory in the decode_buffer itself - input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data ); + input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ) + ( ( stbir_info->input_type != STBIR_TYPE_FLOAT ) ? 
( sizeof(float)*STBIR_INPUT_CALLBACK_PADDING ) : 0 ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data ); } STBIR_PROFILE_START( decode ); // convert the pixels info the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer) - stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data ); + last_decoded = stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data ); STBIR_PROFILE_END( decode ); if (stbir_info->alpha_weight) @@ -4628,9 +4686,19 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float float * marg = full_decode_buffer + x * effective_channels; float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels; STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) ); + if ( e == 1 ) last_decoded = marg + margin * effective_channels; } } } + + // some of the horizontal gathers read one float off the edge (which is masked out), but we force a zero here to make sure no NaNs leak in + // (we can't pre-zero it, because the input callback can use that area as padding) + last_decoded[0] = 0.0f; + + // we clear this extra float, because the final output pixel filter kernel might have used one less coeff than the max filter width + // when this happens, we do read that pixel from the input, so it too could be NaN, so just zero an extra one. 
+ // this fits because each scanline is padded by three floats (STBIR_INPUT_CALLBACK_PADDING) + last_decoded[1] = 0.0f; } @@ -6209,6 +6277,8 @@ static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbi if ( vertical_first ) { // Now resample the gathered vertical data in the horizontal axis into the encode buffer + decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3 + decode_buffer[ width_times_channels+1 ] = 0.0f; stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); } @@ -6380,6 +6450,8 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_ void * scanline_scatter_buffer; void * scanline_scatter_buffer_end; int on_first_input_y, last_input_y; + int width = (stbir_info->vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size; + int width_times_channels = stbir_info->effective_channels * width; STBIR_ASSERT( !stbir_info->vertical.is_gather ); @@ -6414,7 +6486,12 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_ // mark all the buffers as empty to start for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ ) - stbir__get_ring_buffer_entry( stbir_info, split_info, y )[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter + { + float * decode_buffer = stbir__get_ring_buffer_entry( stbir_info, split_info, y ); + decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3 + decode_buffer[ width_times_channels+1 ] = 0.0f; + decode_buffer[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter + } // do the loop in input space on_first_input_y = 1; last_input_y = start_input_y; @@ -6562,7 +6639,7 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir samp->num_contributors = 
stbir__get_contributors(samp, samp->is_gather); samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors); - samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float); // extra sizeof(float) is padding + samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra sizeof(float) is padding samp->gather_prescatter_contributors = 0; samp->gather_prescatter_coefficients = 0; @@ -6667,7 +6744,7 @@ static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contr } } -static void stbir__get_split_info( stbir__per_split_info* split_info, int splits, int output_height, int vertical_pixel_margin, int input_full_height ) +static void stbir__get_split_info( stbir__per_split_info* split_info, int splits, int output_height, int vertical_pixel_margin, int input_full_height, int is_gather, stbir__contributors * contribs ) { int i, cur; int left = output_height; @@ -6676,9 +6753,58 @@ static void stbir__get_split_info( stbir__per_split_info* split_info, int splits for( i = 0 ; i < splits ; i++ ) { int each; + split_info[i].start_output_y = cur; each = left / ( splits - i ); split_info[i].end_output_y = cur + each; + + // ok, when we are gathering, we need to make sure we are starting on a y offset that doesn't have + // a "special" set of coefficients. Basically, with exactly the right filter at exactly the right + // resize at exactly the right phase, some of the coefficients can be zero. When they are zero, we + // don't process them at all. But this leads to a tricky thing with the thread splits, where we + // might have a set of two coeffs like this for example: (4,4) and (3,6). The 4,4 means there was + // just one single coeff because things worked out perfectly (normally, they all have 4 coeffs + // like the range 3,6). 
The problem is that if we start right on the (4,4) on a brand new thread, + // then when we get to (3,6), we don't have the "3" sample in memory (because we didn't load + // it on the initial (4,4) range because it didn't have a 3 (we only add new samples that are + // larger than our existing samples - it's just how the eviction works). So, our solution here + // is pretty simple, if we start right on a range that has samples that start earlier, then we + // simply bump up our previous thread split range to include it, and then start this thread's + // range with the smaller sample. It just moves one scanline from one thread split to another, + // so that we end with the unusual one, instead of start with it. To do this, we check 2-4 + // samples at each thread split start and then occasionally move them. + + if ( ( is_gather ) && ( i ) ) + { + stbir__contributors * small_contribs; + int j, smallest, stop, start_n0; + stbir__contributors * split_contribs = contribs + cur; + + // scan for a max of 3x the filter width or until the next thread split + stop = vertical_pixel_margin * 3; + if ( each < stop ) + stop = each; + + // loops a few times before early out + smallest = 0; + small_contribs = split_contribs; + start_n0 = small_contribs->n0; + for( j = 1 ; j <= stop ; j++ ) + { + ++split_contribs; + if ( split_contribs->n0 > start_n0 ) + break; + if ( split_contribs->n0 < small_contribs->n0 ) + { + small_contribs = split_contribs; + smallest = j; + } + } + + split_info[i-1].end_output_y += smallest; + split_info[i].start_output_y += smallest; + } + cur += each; left -= each; @@ -6774,45 +6900,45 @@ static float stbir__compute_weights[5][STBIR_RESIZE_CLASSIFICATIONS][4]= // 5 = { 0.56250f, 0.59375f, 0.00000f, 0.96875f }, { 1.00000f, 0.06250f, 0.00000f, 1.00000f }, { 0.00000f, 0.09375f, 1.00000f, 1.00000f }, - { 1.00000f, 1.00000f, 1.00000f, 1.00000f }, + { 1.00000f, 1.00000f, 0.31250f, 1.00000f }, { 0.03125f, 0.12500f, 1.00000f, 1.00000f }, - { 0.06250f, 0.12500f, 
0.00000f, 1.00000f }, + { 1.00000f, 1.00000f, 0.06250f, 1.00000f }, { 0.00000f, 1.00000f, 0.00000f, 0.03125f }, }, { { 0.00000f, 0.84375f, 0.00000f, 0.03125f }, { 0.09375f, 0.93750f, 0.00000f, 0.78125f }, { 0.87500f, 0.21875f, 0.00000f, 0.96875f }, { 0.09375f, 0.09375f, 1.00000f, 1.00000f }, - { 1.00000f, 1.00000f, 1.00000f, 1.00000f }, + { 0.00000f, 0.84375f, 0.00000f, 0.03125f }, { 0.03125f, 0.12500f, 1.00000f, 1.00000f }, - { 0.06250f, 0.12500f, 0.00000f, 1.00000f }, + { 1.00000f, 1.00000f, 0.06250f, 1.00000f }, { 0.00000f, 1.00000f, 0.00000f, 0.53125f }, }, { { 0.00000f, 0.53125f, 0.00000f, 0.03125f }, { 0.06250f, 0.96875f, 0.00000f, 0.53125f }, { 0.87500f, 0.18750f, 0.00000f, 0.93750f }, { 0.00000f, 0.09375f, 1.00000f, 1.00000f }, - { 1.00000f, 1.00000f, 1.00000f, 1.00000f }, + { 0.00000f, 0.53125f, 0.00000f, 0.03125f }, { 0.03125f, 0.12500f, 1.00000f, 1.00000f }, - { 0.06250f, 0.12500f, 0.00000f, 1.00000f }, + { 1.00000f, 1.00000f, 0.06250f, 1.00000f }, { 0.00000f, 1.00000f, 0.00000f, 0.56250f }, }, { { 0.00000f, 0.50000f, 0.00000f, 0.71875f }, { 0.06250f, 0.84375f, 0.00000f, 0.87500f }, { 1.00000f, 0.50000f, 0.50000f, 0.96875f }, { 1.00000f, 0.09375f, 0.31250f, 0.50000f }, - { 1.00000f, 1.00000f, 1.00000f, 1.00000f }, + { 0.00000f, 0.50000f, 0.00000f, 0.71875f }, { 1.00000f, 0.03125f, 0.03125f, 0.53125f }, - { 0.18750f, 0.12500f, 0.00000f, 1.00000f }, + { 1.00000f, 1.00000f, 0.06250f, 1.00000f }, { 0.00000f, 1.00000f, 0.03125f, 0.18750f }, }, { { 0.00000f, 0.59375f, 0.00000f, 0.96875f }, { 0.06250f, 0.81250f, 0.06250f, 0.59375f }, { 0.75000f, 0.43750f, 0.12500f, 0.96875f }, { 0.87500f, 0.06250f, 0.18750f, 0.43750f }, - { 1.00000f, 1.00000f, 1.00000f, 1.00000f }, + { 0.00000f, 0.59375f, 0.00000f, 0.96875f }, { 0.15625f, 0.12500f, 1.00000f, 1.00000f }, - { 0.06250f, 0.12500f, 0.00000f, 1.00000f }, + { 1.00000f, 1.00000f, 0.06250f, 1.00000f }, { 0.00000f, 1.00000f, 0.03125f, 0.34375f }, } }; @@ -6866,16 +6992,16 @@ static int stbir__should_do_vertical_first( 
float weights_table[STBIR_RESIZE_CLA // categorize the resize into buckets if ( ( vertical_output_size <= 4 ) || ( horizontal_output_size <= 4 ) ) v_classification = ( vertical_output_size < horizontal_output_size ) ? 6 : 7; + else if ( ( !is_gather ) && ( ( vertical_output_size <= 16 ) || ( horizontal_output_size <= 16 ) ) ) + v_classification = 4; else if ( vertical_scale <= 1.0f ) v_classification = ( is_gather ) ? 1 : 0; else if ( vertical_scale <= 2.0f) v_classification = 2; else if ( vertical_scale <= 3.0f) v_classification = 3; - else if ( vertical_scale <= 4.0f) - v_classification = 5; - else - v_classification = 6; + else + v_classification = 5; // everything bigger than 3x // use the right weights weights = weights_table[ v_classification ]; @@ -6927,7 +7053,8 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample void * alloced = 0; size_t alloced_total = 0; int vertical_first; - int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries; + size_t decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size; + int alloc_ring_buffer_num_entries; int alpha_weighting_type = 0; // 0=none, 1=simple, 2=fancy int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size ); @@ -6972,14 +7099,16 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample vertical_first = stbir__should_do_vertical_first( stbir__compute_weights[ (int)stbir_channel_count_index[ effective_channels ] ], horizontal->filter_pixel_width, horizontal->scale_info.scale, horizontal->scale_info.output_sub_size, vertical->filter_pixel_width, vertical->scale_info.scale, vertical->scale_info.output_sub_size, vertical->is_gather, STBIR__V_FIRST_INFO_POINTER ); // sometimes read one float off in some of the unrolled loops (with a weight of zero coeff, so it doesn't have an effect) - decode_buffer_size = ( 
conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float); // extra float for padding + // we use a few extra floats instead of just 1, so that input callback buffer can overlap with the decode buffer without + // the conversion routines overwriting the callback input data. + decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for input callback stagger #if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8) if ( effective_channels == 3 ) decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations) #endif - ring_buffer_length_bytes = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding + ring_buffer_length_bytes = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for padding // if we do vertical first, the ring buffer holds a whole decoded line if ( vertical_first ) @@ -6994,13 +7123,13 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample if ( ( !vertical->is_gather ) && ( alloc_ring_buffer_num_entries > conservative_split_output_size ) ) alloc_ring_buffer_num_entries = conservative_split_output_size; - ring_buffer_size = alloc_ring_buffer_num_entries * ring_buffer_length_bytes; + ring_buffer_size = (size_t)alloc_ring_buffer_num_entries * (size_t)ring_buffer_length_bytes; // The vertical buffer is used differently, depending on whether we are scattering // the vertical scanlines, or gathering them. // If scattering, it's used at the temp buffer to accumulate each output. // If gathering, it's just the output buffer. 
- vertical_buffer_size = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding + vertical_buffer_size = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float); // extra float for padding // we make two passes through this loop, 1st to add everything up, 2nd to allocate and init for(;;) @@ -7013,7 +7142,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample #ifdef STBIR__SEPARATE_ALLOCATIONS #define STBIR__NEXT_PTR( ptr, size, ntype ) if ( alloced ) { void * p = STBIR_MALLOC( size, user_data); if ( p == 0 ) { stbir__free_internal_mem( info ); return 0; } (ptr) = (ntype*)p; } #else - #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = ((char*)advance_mem) + (size); + #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = (char*)(((size_t)advance_mem) + (size)); #endif STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info ); @@ -7036,9 +7165,9 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample info->offset_x = new_x; info->offset_y = new_y; - info->alloc_ring_buffer_num_entries = alloc_ring_buffer_num_entries; + info->alloc_ring_buffer_num_entries = (int)alloc_ring_buffer_num_entries; info->ring_buffer_num_entries = 0; - info->ring_buffer_length_bytes = ring_buffer_length_bytes; + info->ring_buffer_length_bytes = (int)ring_buffer_length_bytes; info->splits = splits; info->vertical_first = vertical_first; @@ -7119,14 +7248,14 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample // alloc memory for to-be-pivoted coeffs (if necessary) if ( vertical->is_gather == 0 ) { - int both; - int temp_mem_amt; + size_t both; + size_t temp_mem_amt; // 
when in vertical scatter mode, we first build the coefficients in gather mode, and then pivot after, // that means we need two buffers, so we try to use the decode buffer and ring buffer for this. if that // is too small, we just allocate extra memory to use as this temp. - both = vertical->gather_prescatter_contributors_size + vertical->gather_prescatter_coefficients_size; + both = (size_t)vertical->gather_prescatter_contributors_size + (size_t)vertical->gather_prescatter_coefficients_size; #ifdef STBIR__SEPARATE_ALLOCATIONS temp_mem_amt = decode_buffer_size; @@ -7136,7 +7265,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample --temp_mem_amt; // avx in 3 channel mode needs one float at the start of the buffer #endif #else - temp_mem_amt = ( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * splits; + temp_mem_amt = (size_t)( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * (size_t)splits; #endif if ( temp_mem_amt >= both ) { @@ -7222,7 +7351,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample } // setup the vertical split ranges - stbir__get_split_info( info->split_info, info->splits, info->vertical.scale_info.output_sub_size, info->vertical.filter_pixel_margin, info->vertical.scale_info.input_full_size ); + stbir__get_split_info( info->split_info, info->splits, info->vertical.scale_info.output_sub_size, info->vertical.filter_pixel_margin, info->vertical.scale_info.input_full_size, info->vertical.is_gather, info->vertical.contributors ); // now we know precisely how many entries we need info->ring_buffer_num_entries = info->vertical.extent_info.widest; @@ -7231,39 +7360,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample if ( ( !info->vertical.is_gather ) && ( info->ring_buffer_num_entries > conservative_split_output_size ) ) info->ring_buffer_num_entries = conservative_split_output_size; STBIR_ASSERT( info->ring_buffer_num_entries <= 
info->alloc_ring_buffer_num_entries ); - - // a few of the horizontal gather functions read past the end of the decode (but mask it out), - // so put in normal values so no snans or denormals accidentally sneak in (also, in the ring - // buffer for vertical first) - for( i = 0 ; i < splits ; i++ ) - { - int t, ofs, start; - - ofs = decode_buffer_size / 4; - - #if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8) - if ( effective_channels == 3 ) - --ofs; // avx in 3 channel mode needs one float at the start of the buffer, so we snap back for clearing - #endif - - start = ofs - 4; - if ( start < 0 ) start = 0; - - for( t = start ; t < ofs; t++ ) - info->split_info[i].decode_buffer[ t ] = 9999.0f; - - if ( vertical_first ) - { - int j; - for( j = 0; j < info->ring_buffer_num_entries ; j++ ) - { - for( t = start ; t < ofs; t++ ) - stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ t ] = 9999.0f; - } - } - } } - #undef STBIR__NEXT_PTR @@ -7528,7 +7625,7 @@ static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 numer_estimate = temp; } - // we didn't fine anything good enough for float, use a full range estimate + // we didn't find anything good enough for float, use a full range estimate if ( limit_denom ) { numer_estimate= (stbir_uint64)( f * (double)limit + 0.5 ); @@ -7818,7 +7915,7 @@ static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) stbir__set_sampler(&horizontal, resize->horizontal_filter, resize->horizontal_filter_kernel, resize->horizontal_filter_support, resize->horizontal_edge, &horizontal.scale_info, 1, resize->user_data ); stbir__get_conservative_extents( &horizontal, &conservative, resize->user_data ); - stbir__set_sampler(&vertical, resize->vertical_filter, resize->horizontal_filter_kernel, resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data ); + stbir__set_sampler(&vertical, resize->vertical_filter, resize->vertical_filter_kernel, 
resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data ); if ( ( vertical.scale_info.output_sub_size / splits ) < STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS ) // each split should be a minimum of 4 scanlines (handwavey choice) { @@ -7849,7 +7946,7 @@ static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) return 0; } -void stbir_free_samplers( STBIR_RESIZE * resize ) +STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize ) { if ( resize->samplers ) { @@ -7943,138 +8040,64 @@ STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start return stbir__perform_resize( resize->samplers, split_start, split_count ); } -static int stbir__check_output_stuff( void ** ret_ptr, int * ret_pitch, void * output_pixels, int type_size, int output_w, int output_h, int output_stride_in_bytes, stbir_internal_pixel_layout pixel_layout ) + +static void * stbir_quick_resize_helper( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout, stbir_datatype data_type, stbir_edge edge, stbir_filter filter ) { - size_t size; - int pitch; - void * ptr; + STBIR_RESIZE resize; + int scanline_output_in_bytes; + int positive_output_stride_in_bytes; + void * start_ptr; + void * free_ptr; - pitch = output_w * type_size * stbir__pixel_channels[ pixel_layout ]; - if ( pitch == 0 ) + scanline_output_in_bytes = output_w * stbir__type_size[ data_type ] * stbir__pixel_channels[ stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ]; + if ( scanline_output_in_bytes == 0 ) return 0; + // if zero stride, use scanline output if ( output_stride_in_bytes == 0 ) - output_stride_in_bytes = pitch; + output_stride_in_bytes = scanline_output_in_bytes; - if ( output_stride_in_bytes < pitch ) + // abs value for inverted images (negative pitches) + positive_output_stride_in_bytes = 
output_stride_in_bytes; + if ( positive_output_stride_in_bytes < 0 ) + positive_output_stride_in_bytes = -positive_output_stride_in_bytes; + + // is the requested stride smaller than the scanline output? if so, just fail + if ( positive_output_stride_in_bytes < scanline_output_in_bytes ) return 0; - size = (size_t)output_stride_in_bytes * (size_t)output_h; - if ( size == 0 ) - return 0; - - *ret_ptr = 0; - *ret_pitch = output_stride_in_bytes; + start_ptr = output_pixels; + free_ptr = 0; // no free pointer, since they passed buffer to use + // did they pass a zero for the dest? if so, allocate the buffer if ( output_pixels == 0 ) { - ptr = STBIR_MALLOC( size, 0 ); + size_t size; + char * ptr; + + size = (size_t)positive_output_stride_in_bytes * (size_t)output_h; + if ( size == 0 ) + return 0; + + ptr = (char*) STBIR_MALLOC( size, 0 ); if ( ptr == 0 ) return 0; - *ret_ptr = ptr; - *ret_pitch = pitch; + free_ptr = ptr; + + // point at the last scanline, if they requested a flipped image + if ( output_stride_in_bytes < 0 ) + start_ptr = ptr + ( (size_t)positive_output_stride_in_bytes * (size_t)( output_h - 1 ) ); + else + start_ptr = ptr; } - return 1; -} - - -STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, - unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, - stbir_pixel_layout pixel_layout ) -{ - STBIR_RESIZE resize; - unsigned char * optr; - int opitch; - - if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) ) - return 0; - + // ok, now do the resize stbir_resize_init( &resize, input_pixels, input_w, input_h, input_stride_in_bytes, - (optr) ? 
optr : output_pixels, output_w, output_h, opitch, - pixel_layout, STBIR_TYPE_UINT8 ); - - if ( !stbir_resize_extended( &resize ) ) - { - if ( optr ) - STBIR_FREE( optr, 0 ); - return 0; - } - - return (optr) ? optr : output_pixels; -} - -STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, - unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, - stbir_pixel_layout pixel_layout ) -{ - STBIR_RESIZE resize; - unsigned char * optr; - int opitch; - - if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) ) - return 0; - - stbir_resize_init( &resize, - input_pixels, input_w, input_h, input_stride_in_bytes, - (optr) ? optr : output_pixels, output_w, output_h, opitch, - pixel_layout, STBIR_TYPE_UINT8_SRGB ); - - if ( !stbir_resize_extended( &resize ) ) - { - if ( optr ) - STBIR_FREE( optr, 0 ); - return 0; - } - - return (optr) ? optr : output_pixels; -} - - -STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes, - float *output_pixels, int output_w, int output_h, int output_stride_in_bytes, - stbir_pixel_layout pixel_layout ) -{ - STBIR_RESIZE resize; - float * optr; - int opitch; - - if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( float ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) ) - return 0; - - stbir_resize_init( &resize, - input_pixels, input_w, input_h, input_stride_in_bytes, - (optr) ? optr : output_pixels, output_w, output_h, opitch, - pixel_layout, STBIR_TYPE_FLOAT ); - - if ( !stbir_resize_extended( &resize ) ) - { - if ( optr ) - STBIR_FREE( optr, 0 ); - return 0; - } - - return (optr) ? 
optr : output_pixels; -} - - -STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes, - void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, - stbir_pixel_layout pixel_layout, stbir_datatype data_type, - stbir_edge edge, stbir_filter filter ) -{ - STBIR_RESIZE resize; - float * optr; - int opitch; - - if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, stbir__type_size[data_type], output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) ) - return 0; - - stbir_resize_init( &resize, - input_pixels, input_w, input_h, input_stride_in_bytes, - (optr) ? optr : output_pixels, output_w, output_h, output_stride_in_bytes, + start_ptr, output_w, output_h, output_stride_in_bytes, pixel_layout, data_type ); resize.horizontal_edge = edge; @@ -8084,19 +8107,60 @@ STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input if ( !stbir_resize_extended( &resize ) ) { - if ( optr ) - STBIR_FREE( optr, 0 ); + if ( free_ptr ) + STBIR_FREE( free_ptr, 0 ); return 0; } - return (optr) ? optr : output_pixels; + return (free_ptr) ? 
free_ptr : start_ptr; +} + + + +STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout ) +{ + return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout, STBIR_TYPE_UINT8, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT ); +} + +STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout ) +{ + return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout, STBIR_TYPE_UINT8_SRGB, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT ); +} + + +STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + float *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout ) +{ + return (float *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout, STBIR_TYPE_FLOAT, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT ); +} + + +STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout, stbir_datatype data_type, + stbir_edge edge, stbir_filter filter ) +{ + return (void *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, 
output_h, output_stride_in_bytes, + pixel_layout, data_type, edge, filter ); } #ifdef STBIR_PROFILE STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize ) { - static char const * bdescriptions[6] = { "Building", "Allocating", "Horizontal sampler", "Vertical sampler", "Coefficient cleanup", "Coefficient piovot" } ; + static char const * bdescriptions[6] = { "Building", "Allocating", "Horizontal sampler", "Vertical sampler", "Coefficient cleanup", "Coefficient pivot" } ; stbir__info* samp = resize->samplers; int i; @@ -8147,7 +8211,7 @@ STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * info, STBIR_ info->clocks[i] = sum; } - info->total_clocks = split_info->profile.named.total; + info->total_clocks = split_info->profile.named.total; info->descriptions = descriptions; info->count = STBIR__ARRAY_SIZE( descriptions ); } @@ -8226,7 +8290,7 @@ STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STB #define stbir__encode_simdfX_unflip stbir__encode_simdf4_unflip #endif -static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; float * decode_end = (float*) decode + width_times_channels; @@ -8286,7 +8350,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco decode = decode_end; // backup and do last couple input = end_input_m16; } - return; + return decode_end + 16; } #endif @@ -8324,6 +8388,8 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco input += stbir__coder_min_num; } #endif + + return decode_end; } static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outputp, int width_times_channels, float const * encode ) @@ -8443,7 +8509,7 @@ 
static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu #endif } -static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; float * decode_end = (float*) decode + width_times_channels; @@ -8497,7 +8563,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int decode = decode_end; // backup and do last couple input = end_input_m16; } - return; + return decode_end + 16; } #endif @@ -8535,6 +8601,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int input += stbir__coder_min_num; } #endif + return decode_end; } static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int width_times_channels, float const * encode ) @@ -8636,10 +8703,10 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int #endif } -static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; - float const * decode_end = (float*) decode + width_times_channels; + float * decode_end = (float*) decode + width_times_channels; unsigned char const * input = (unsigned char const *)inputp; // try to do blocks of 4 when you can @@ -8674,6 +8741,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi input += stbir__coder_min_num; } #endif + return decode_end; } #define stbir__min_max_shift20( i, f ) \ @@ -8691,7 +8759,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi { \ stbir__simdi temp; \ stbir__simdi_32shr( temp, stbir_simdi_castf( f ), 
12 ) ; \ - stbir__simdi_and( temp, temp, STBIR__CONSTI(STBIR_mastissa_mask) ); \ + stbir__simdi_and( temp, temp, STBIR__CONSTI(STBIR_mantissa_mask) ); \ stbir__simdi_or( temp, temp, STBIR__CONSTI(STBIR_topscale) ); \ stbir__simdi_16madd( i, i, temp ); \ stbir__simdi_32shr( i, i, 16 ); \ @@ -8826,11 +8894,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w #if ( stbir__coder_min_num == 4 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) ) -static void STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; - float const * decode_end = (float*) decode + width_times_channels; + float * decode_end = (float*) decode + width_times_channels; unsigned char const * input = (unsigned char const *)inputp; + do { decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ]; decode[1] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order1] ]; @@ -8839,6 +8908,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * de input += 4; decode += 4; } while( decode < decode_end ); + return decode_end; } @@ -8911,11 +8981,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o #if ( stbir__coder_min_num == 2 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) ) -static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; - float const * decode_end = (float*) decode + width_times_channels; + float * decode_end 
= (float*) decode + width_times_channels; unsigned char const * input = (unsigned char const *)inputp; + decode += 4; while( decode <= decode_end ) { @@ -8929,9 +9000,10 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * de decode -= 4; if( decode < decode_end ) { - decode[0] = stbir__srgb_uchar_to_linear_float[ stbir__decode_order0 ]; + decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ]; decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted; } + return decode_end; } static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * outputp, int width_times_channels, float const * encode ) @@ -8997,7 +9069,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o #endif -static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; float * decode_end = (float*) decode + width_times_channels; @@ -9045,7 +9117,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod decode = decode_end; // backup and do last couple input = end_input_m8; } - return; + return decode_end + 8; } #endif @@ -9083,6 +9155,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod input += stbir__coder_min_num; } #endif + return decode_end; } @@ -9202,7 +9275,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output #endif } -static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; 
float * decode_end = (float*) decode + width_times_channels; @@ -9247,7 +9320,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int decode = decode_end; // backup and do last couple input = end_input_m8; } - return; + return decode_end + 8; } #endif @@ -9285,6 +9358,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int input += stbir__coder_min_num; } #endif + return decode_end; } static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int width_times_channels, float const * encode ) @@ -9385,7 +9459,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int #endif } -static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; float * decode_end = (float*) decode + width_times_channels; @@ -9431,7 +9505,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, decode = decode_end; // backup and do last couple input = end_input_m8; } - return; + return decode_end + 8; } #endif @@ -9469,6 +9543,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, input += stbir__coder_min_num; } #endif + return decode_end; } static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp, int width_times_channels, float const * encode ) @@ -9555,7 +9630,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp #endif } -static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp ) { #ifdef stbir__decode_swizzle float STBIR_STREAMOUT_PTR( 
* ) decode = decodep; @@ -9609,7 +9684,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int decode = decode_end; // backup and do last couple input = end_input_m16; } - return; + return decode_end + 16; } #endif @@ -9647,18 +9722,21 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int input += stbir__coder_min_num; } #endif + return decode_end; #else if ( (void*)decodep != inputp ) STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) ); + return decodep + width_times_channels; + #endif } static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int width_times_channels, float const * encode ) { - #if !defined( STBIR_FLOAT_HIGH_CLAMP ) && !defined(STBIR_FLOAT_LO_CLAMP) && !defined(stbir__decode_swizzle) + #if !defined( STBIR_FLOAT_HIGH_CLAMP ) && !defined(STBIR_FLOAT_LOW_CLAMP) && !defined(stbir__decode_swizzle) if ( (void*)outputp != (void*) encode ) STBIR_MEMCPY( outputp, encode, width_times_channels * sizeof( float ) ); @@ -9713,7 +9791,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int stbir__simdfX_store( output+stbir__simdfX_float_count, e1 ); encode += stbir__simdfX_float_count * 2; output += stbir__simdfX_float_count * 2; - if ( output < end_output ) + if ( output <= end_output ) continue; if ( output == ( end_output + ( stbir__simdfX_float_count * 2 ) ) ) break;