diff --git a/avs_core/filters/intel/resample_avx2.cpp b/avs_core/filters/intel/resample_avx2.cpp index 1929021eb..b28c74b24 100644 --- a/avs_core/filters/intel/resample_avx2.cpp +++ b/avs_core/filters/intel/resample_avx2.cpp @@ -365,7 +365,7 @@ static void internal_resizer_h_avx2_generic_uint8_16_t(BYTE* dst8, const BYTE* s dst_pitch /= sizeof(pixel_t); src_pitch /= sizeof(pixel_t); - const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8; + const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8; for (int y = 0; y < height; y++) { const short* AVS_RESTRICT current_coeff_base = program->pixel_coefficient; @@ -586,7 +586,7 @@ static void internal_resizer_h_avx2_generic_float(BYTE* dst8, const BYTE* src8, dst_pitch = dst_pitch / sizeof(float); src_pitch = src_pitch / sizeof(float); - const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8; + const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8; for (int y = 0; y < height; y++) { float* current_coeff_base = program->pixel_coefficient_float; @@ -627,7 +627,7 @@ void resizer_h_avx2_generic_float(BYTE* dst8, const BYTE* src8, int dst_pitch, i // end of H float //-------- 256 bit Verticals -/* +#if 0 void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel) { AVS_UNUSED(bits_per_pixel); @@ -702,7 +702,7 @@ void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int d current_coeff += filter_size; } } -*/ +#else void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel) { @@ -822,7 +822,7 @@ void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int d current_coeff += filter_size; } } - +#endif template void resize_v_avx2_planar_uint16_t(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel) { @@ -987,6 +987,102 @@ void resize_v_avx2_planar_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int } } +// Memory-transfer optimized version +void resize_v_avx2_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel) +{ + AVS_UNUSED(bits_per_pixel); + + const int filter_size = program->filter_size; + const float* AVS_RESTRICT current_coeff = program->pixel_coefficient_float; + + const float* src = (const float*)src8; + float* AVS_RESTRICT dst = (float*)dst8; + dst_pitch = dst_pitch / sizeof(float); + src_pitch = src_pitch / sizeof(float); + + const int kernel_size = program->filter_size_real; // not the aligned + const int kernel_size_mod2 = (kernel_size / 2) * 2; // Process pairs of rows for better efficiency + const bool notMod2 = kernel_size_mod2 < kernel_size; + + const int width_mod32 = (width / 32) * 32; // Process by 4x 256bit (8 x 8 floats) to make memory read/write linear streams longer, 16x256 bit registers in 64bit mode should be enough + + for (int y = 0; y < target_height; y++) { + int offset = program->pixel_offset[y]; + const float* src_ptr = src + offset * src_pitch; + + for (int x = 0; x < width_mod32; x += 32) { + __m256 result_1 = _mm256_setzero_ps(); + __m256 result_2 = _mm256_setzero_ps(); + __m256 result_3 = _mm256_setzero_ps(); + __m256 result_4 = _mm256_setzero_ps(); + + const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here + + int i = 0; + for (; i < kernel_size; i ++) { + // coefs are equal for all H-samples + __m256 coeff = _mm256_set1_ps(current_coeff[i]); + + __m256 src_1 = _mm256_load_ps(src2_ptr); // why was loadu ? source always aligned in V-resizers ? + __m256 src_2 = _mm256_load_ps(src2_ptr + 8); + __m256 src_3 = _mm256_load_ps(src2_ptr + 16); + __m256 src_4 = _mm256_load_ps(src2_ptr + 24); + + result_1 = _mm256_fmadd_ps(src_1, coeff, result_1); + result_2 = _mm256_fmadd_ps(src_2, coeff, result_2); + result_3 = _mm256_fmadd_ps(src_3, coeff, result_3); + result_4 = _mm256_fmadd_ps(src_4, coeff, result_4); + + src2_ptr += src_pitch; + } + + _mm256_store_ps(dst + x, result_1); + _mm256_store_ps(dst + x + 8, result_2); + _mm256_store_ps(dst + x + 16, result_3); + _mm256_store_ps(dst + x + 24, result_4); + } // width_mod32 + + // 32 byte 8 floats (AVX2 register holds 8 floats) + // no need for wmod8, alignment is safe 32 bytes at least + for (int x = width_mod32; x < width; x += 8) { + __m256 result_single = _mm256_setzero_ps(); + __m256 result_single_2 = _mm256_setzero_ps(); + + const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here + + // Process pairs of rows for better efficiency (2 coeffs/cycle) + // two result variables for potential parallel operation + int i = 0; + for (; i < kernel_size_mod2; i += 2) { + __m256 coeff_even = _mm256_set1_ps(current_coeff[i]); + __m256 coeff_odd = _mm256_set1_ps(current_coeff[i + 1]); + + __m256 src_even = _mm256_load_ps(src2_ptr); + __m256 src_odd = _mm256_load_ps(src2_ptr + src_pitch); + + result_single = _mm256_fmadd_ps(src_even, coeff_even, result_single); + result_single_2 = _mm256_fmadd_ps(src_odd, coeff_odd, result_single_2); + + src2_ptr += 2 * src_pitch; + } + + result_single = _mm256_add_ps(result_single, result_single_2); + + // Process the last odd row if needed + if (notMod2) { + __m256 coeff = _mm256_set1_ps(current_coeff[i]); + __m256 src_val = _mm256_load_ps(src2_ptr); + result_single = _mm256_fmadd_ps(src_val, coeff, result_single); + } + + _mm256_store_ps(dst + x, result_single); + } + + dst += dst_pitch; + current_coeff += filter_size; + } +} + // avx2 16bit template void resizer_h_avx2_generic_uint16_t(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); // avx2 10-14bit @@ -998,3 +1094,874 @@ template void resizer_h_avx2_generic_uint16_t(BYTE* dst8, const BYTE* src8 template void resize_v_avx2_planar_uint16_t(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); // avx2 10-14bit template void resize_v_avx2_planar_uint16_t(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); + + +// Transpose-based SIMD +void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) { + int filter_size = program->filter_size; + + const float* AVS_RESTRICT current_coeff; + + src_pitch = src_pitch / sizeof(float); + dst_pitch = dst_pitch / sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + const int kernel_size = program->filter_size_real; + const int ksmod4 = kernel_size / 4 * 4; +// const int ksmod8 = kernel_size / 8 * 8; +#if 0 + + for (int y = 0; y < height; y ++) { + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch; + const float* src_ptr = src + y * src_pitch; + + for (int x = 0; x < width; x += 8) { + + __m256 result = _mm256_setzero_ps(); + + for (int i = 0; i < ksmod4; i += 4) { + + __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0] + i, src_ptr + program->pixel_offset[x + 4] + i); + __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1] + i, src_ptr + program->pixel_offset[x + 5] + i); + __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2] + i, src_ptr + program->pixel_offset[x + 6] + i); + __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3] + i, src_ptr + program->pixel_offset[x + 7] + i); + + __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + i + filter_size * 0, current_coeff + i + filter_size * 4); + __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + i + filter_size * 1, current_coeff + i + filter_size * 5); + __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + i + filter_size * 2, current_coeff + i + filter_size * 6); + __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + i + filter_size * 3, current_coeff + i + filter_size * 7); + + _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8); + _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8); + + result = _mm256_fmadd_ps(data_1_data_5, coef_1_coef_5, result); + result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result); + result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result); + result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result); + + } + + _mm256_store_ps(dst2_ptr + x, result); + current_coeff += filter_size * 8; + } + } +#endif + + for (int y = 0; y < height; y+=2) { + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch; + float* AVS_RESTRICT dst2_ptr2 = dst + (y + 1) * dst_pitch; + const float* src_ptr = src + y * src_pitch; + const float* src_ptr2 = src + (y + 1) * src_pitch; + + for (int x = 0; x < width; x += 8) { + + __m256 result = _mm256_setzero_ps(); + __m256 result2 = _mm256_setzero_ps(); + + for (int i = 0; i < kernel_size; i += 4) { // is it always mod4 ? + __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0] + i, src_ptr + program->pixel_offset[x + 4] + i); + __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1] + i, src_ptr + program->pixel_offset[x + 5] + i); + __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2] + i, src_ptr + program->pixel_offset[x + 6] + i); + __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3] + i, src_ptr + program->pixel_offset[x + 7] + i); + + __m256 data_1_data_5_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 0] + i, src_ptr2 + program->pixel_offset[x + 4] + i); + __m256 data_2_data_6_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 1] + i, src_ptr2 + program->pixel_offset[x + 5] + i); + __m256 data_3_data_7_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 2] + i, src_ptr2 + program->pixel_offset[x + 6] + i); + __m256 data_4_data_8_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 3] + i, src_ptr2 + program->pixel_offset[x + 7] + i); + + __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + i + filter_size * 0, current_coeff + i + filter_size * 4); + __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + i + filter_size * 1, current_coeff + i + filter_size * 5); + __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + i + filter_size * 2, current_coeff + i + filter_size * 6); + __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + i + filter_size * 3, current_coeff + i + filter_size * 7); + + _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8); + _MM_TRANSPOSE8_LANE4_PS(data_1_data_5_2, data_2_data_6_2, data_3_data_7_2, data_4_data_8_2); + _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8); + + result = _mm256_fmadd_ps(data_1_data_5, coef_1_coef_5, result); + result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result); + result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result); + result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result); + + result2 = _mm256_fmadd_ps(data_1_data_5_2, coef_1_coef_5, result2); + result2 = _mm256_fmadd_ps(data_2_data_6_2, coef_2_coef_6, result2); + result2 = _mm256_fmadd_ps(data_3_data_7_2, coef_3_coef_7, result2); + result2 = _mm256_fmadd_ps(data_4_data_8_2, coef_4_coef_8, result2); + + } + + // need to process last non-mod4 kernel samples in scalar way. or can we do over-read up to 3 kernel and source samples safely with main 4-kernel_samples loop ? + + _mm256_store_ps(dst2_ptr + x, result); + _mm256_store_ps(dst2_ptr2 + x, result2); + current_coeff += filter_size * 8; + } + } + +} + + +// Safe dual lane partial load with AVX +// Read exactly N pixels, avoiding +// - reading beyond the end of the source buffer. +// - avoid NaN contamination, since event with zero coefficients NaN * 0 = NaN +template +AVS_FORCEINLINE static __m256 _mm256_load_partial_safe_2_m128(const float* src_ptr_offsetted1, const float* src_ptr_offsetted2) { + __m128 s1; + __m128 s2; + switch (Nmod4) { + case 1: + s1 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted1[0]); + s2 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted2[0]); + // ideally: movss + break; + case 2: + s1 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted1[1], src_ptr_offsetted1[0]); + s2 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted2[1], src_ptr_offsetted2[0]); + // ideally: movsd + break; + case 3: + s1 = _mm_set_ps(0.0f, src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]); + s2 = _mm_set_ps(0.0f, src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]); + // ideally: movss + movsd + shuffle or movsd + insert + break; + case 0: + s1 = _mm_set_ps(src_ptr_offsetted1[3], src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]); + s2 = _mm_set_ps(src_ptr_offsetted2[3], src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]); + // ideally: movups + break; + default: + s1 = _mm_setzero_ps(); // n/a cannot happen + s2 = _mm_setzero_ps(); + } + return _mm256_set_m128(s2, s1); +} + + +// Processes a horizontal resampling kernel of up to four coefficients for float pixel types. +// Supports BilinearResize, BicubicResize, or sinc with up to 2 taps (filter size <= 4). +// AVX optimization loads and processes four float coefficients and eight pixels simultaneously. +// The 'filtersizemod4' template parameter (0-3) helps optimize for different filter sizes modulo 4. +// This AVX2 requires only filter_size_alignment of 4. +template +void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) { + assert(filtersizemod4 >= 0 && filtersizemod4 <= 3); + + const int filter_size = program->filter_size; // aligned, practically the coeff table stride + + src_pitch /= sizeof(float); + dst_pitch /= sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + constexpr int PIXELS_AT_A_TIME = 8; // Process eight pixels in parallel using AVX2 (2x4 using m128 lanes) + + // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width. + // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back + // from the target width, as we load 4 floats at once with '_mm_loadu_ps'. + const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME; + + // Preconditions: + assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop. + + // 'target_size_alignment' ensures we can safely access coefficients using offsets like + // 'filter_size * 7' when processing 8 H pixels at a time or + // 'filter_size * 15' when processing 16 H pixels at a time + assert(program->target_size_alignment >= 8); + + // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads. + assert(program->filter_size_alignment >= 4); + + int x = 0; + + // This 'auto' lambda construct replaces the need of templates + auto do_h_float_core = [&](auto partial_load) { + // Load up to 2x4 coefficients at once before the height loop. + // Pre-loading and transposing coefficients keeps register usage efficient. + // Assumes 'filter_size_aligned' is at least 4. + + // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3] and for src_ptr + begin5 [0..3] ) + __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4); + __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5); + __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6); + __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7); + + _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src; + + // Pixel offsets for the current target x-positions. + // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'. + const int begin1 = program->pixel_offset[x + 0]; + const int begin2 = program->pixel_offset[x + 1]; + const int begin3 = program->pixel_offset[x + 2]; + const int begin4 = program->pixel_offset[x + 3]; + const int begin5 = program->pixel_offset[x + 4]; + const int begin6 = program->pixel_offset[x + 5]; + const int begin7 = program->pixel_offset[x + 6]; + const int begin8 = program->pixel_offset[x + 7]; + + for (int y = 0; y < height; y++) + { + __m256 data_1_data_5; + __m256 data_2_data_6; + __m256 data_3_data_7; + __m256 data_4_data_8; + + if constexpr (partial_load) { + // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function + // to prevent reading beyond the allocated source scanline. This handles cases where loading 4 floats + // starting from 'src_ptr + beginX' might exceed the source buffer. + + // Example of the unsafe scenario: If target width is 320, a naive load at src_ptr + 317 + // would attempt to read floats at indices 317, 318, 319, and 320, potentially going out of bounds. + + // Two main issues in the unsafe zone: + // 1.) Out-of-bounds memory access: Reading beyond the allocated memory for the source scanline can + // lead to access violations and crashes. '_mm_loadu_ps' attempts to load 16 bytes, so even if + // the starting address is within bounds, subsequent reads might not be. + // 2.) Garbage or NaN values: Even if a read doesn't cause a crash, accessing uninitialized or + // out-of-bounds memory (especially for float types) can result in garbage data, including NaN. + // Multiplying by a valid coefficient and accumulating this NaN can contaminate the final result. + + // '_mm256_load_partial_safe_2_m128' safely loads up to 'filter_size_real' pixels and pads with zeros if needed, + // preventing out-of-bounds reads and ensuring predictable results even near the image edges. + + data_1_data_5 = _mm256_load_partial_safe_2_m128(src_ptr + begin1, src_ptr + begin5); + data_2_data_6 = _mm256_load_partial_safe_2_m128(src_ptr + begin2, src_ptr + begin6); + data_3_data_7 = _mm256_load_partial_safe_2_m128(src_ptr + begin3, src_ptr + begin7); + data_4_data_8 = _mm256_load_partial_safe_2_m128(src_ptr + begin4, src_ptr + begin8); + } + else { + // In the safe zone, we can directly load 4 pixels at a time using unaligned loads. + data_1_data_5 = _mm256_loadu_2_m128(src_ptr + begin1, src_ptr + begin5); + data_2_data_6 = _mm256_loadu_2_m128(src_ptr + begin2, src_ptr + begin6); + data_3_data_7 = _mm256_loadu_2_m128(src_ptr + begin3, src_ptr + begin7); + data_4_data_8 = _mm256_loadu_2_m128(src_ptr + begin4, src_ptr + begin8); + } + + _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8); + + __m256 result = _mm256_mul_ps(data_1_data_5, coef_1_coef_5); + result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result); + result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result); + result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result); + + _mm256_store_ps(dst_ptr, result); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } // y + current_coeff += filter_size * 8; // Move to the next set of coefficients for the next 8 output pixels + }; // end of lambda + + // Process the 'safe zone' where direct full unaligned loads are acceptable. + for (; x < width_safe_mod; x += PIXELS_AT_A_TIME) + { + do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm_loadu_ps + } + + // Process the potentially 'unsafe zone' near the image edge, using safe loading. + for (; x < width; x += PIXELS_AT_A_TIME) + { + do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm256_load_partial_safe_2_m128' + } +} + +// Instantiate them +template void resize_h_planar_float_avx_transpose_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx_transpose_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx_transpose_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx_transpose_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + +void resize_h_planar_float_avx_gather_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) +{ + int filter_size = program->filter_size; + + const float* AVS_RESTRICT current_coeff; + + src_pitch = src_pitch / sizeof(float); + dst_pitch = dst_pitch / sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + __m256i one_epi32 = _mm256_set1_epi32(1); + + for (int x = 0; x < width; x += 8) + { + __m256 coef_0 = _mm256_load_ps(current_coeff + filter_size * 0); + __m256 coef_1 = _mm256_load_ps(current_coeff + filter_size * 1); + __m256 coef_2 = _mm256_load_ps(current_coeff + filter_size * 2); + __m256 coef_3 = _mm256_load_ps(current_coeff + filter_size * 3); + __m256 coef_4 = _mm256_load_ps(current_coeff + filter_size * 4); + __m256 coef_5 = _mm256_load_ps(current_coeff + filter_size * 5); + __m256 coef_6 = _mm256_load_ps(current_coeff + filter_size * 6); + __m256 coef_7 = _mm256_load_ps(current_coeff + filter_size * 7); + + _MM_TRANSPOSE8_PS(coef_0, coef_1, coef_2, coef_3, coef_4, coef_5, coef_6, coef_7); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src; + + for (int y = 0; y < height; y++) + { + // __m256i offsets = _mm256_load_si256(program->pixel_offset + x); // hope it is always aligned ? + __m256i offsets = _mm256_set_epi32(program->pixel_offset[x + 7], program->pixel_offset[x + 6], program->pixel_offset[x + 5], program->pixel_offset[x + 4], program->pixel_offset[x + 3], program->pixel_offset[x + 2], program->pixel_offset[x + 1], program->pixel_offset[x + 0]); + // __m256i offsets = _mm256_set1_epi32(program->pixel_offset[x]); // test + __m256 data_0 = _mm256_i32gather_ps(src_ptr, offsets, 4); + + offsets = _mm256_add_epi32(offsets, one_epi32); + __m256 data_1 = _mm256_i32gather_ps(src_ptr, offsets, 4); + + offsets = _mm256_add_epi32(offsets, one_epi32); + __m256 data_2 = _mm256_i32gather_ps(src_ptr, offsets, 4); + + offsets = _mm256_add_epi32(offsets, one_epi32); + __m256 data_3 = _mm256_i32gather_ps(src_ptr, offsets, 4); + + offsets = _mm256_add_epi32(offsets, one_epi32); + __m256 data_4 = _mm256_i32gather_ps(src_ptr, offsets, 4); + + offsets = _mm256_add_epi32(offsets, one_epi32); + __m256 data_5 = _mm256_i32gather_ps(src_ptr, offsets, 4); + + offsets = _mm256_add_epi32(offsets, one_epi32); + __m256 data_6 = _mm256_i32gather_ps(src_ptr, offsets, 4); + + offsets = _mm256_add_epi32(offsets, one_epi32); + __m256 data_7 = _mm256_i32gather_ps(src_ptr, offsets, 4); + + __m256 result0 = _mm256_mul_ps(data_0, coef_0); + __m256 result1 = _mm256_mul_ps(data_4, coef_4); + + result0 = _mm256_fmadd_ps(data_1, coef_1, result0); + result1 = _mm256_fmadd_ps(data_5, coef_5, result1); + + result0 = _mm256_fmadd_ps(data_2, coef_2, result0); + result1 = _mm256_fmadd_ps(data_6, coef_6, result1); + + result0 = _mm256_fmadd_ps(data_3, coef_3, result0); + result1 = _mm256_fmadd_ps(data_7, coef_7, result1); + + _mm256_store_ps(dst_ptr, _mm256_add_ps(result0, result1)); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } + current_coeff += filter_size * 8; + } +} + +void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) { + + // assert - check if max pixel_offset is not above single load of 8 src floats (or need several loads and more complex permute program) + // probably this is a valid assumption; there can be no jumps in source pixel indexes, it would mean that the + // filter would neglect some pixels in the source image, which is not allowed by the filter design +#if 1 //def _DEBUG + for (int x = 0; x < width; x += 8) + { + int start_off = program->pixel_offset[x + 0]; + int end_off = program->pixel_offset[x + 7]; + assert((end_off - start_off) <= 7); + + start_off = program->pixel_offset[x + 1]; + end_off = program->pixel_offset[x + 7 + 1]; + assert((end_off - start_off) <= 7); + + start_off = program->pixel_offset[x + 2]; + end_off = program->pixel_offset[x + 7 + 2]; + assert((end_off - start_off) <= 7); + + start_off = program->pixel_offset[x + 3]; + end_off = program->pixel_offset[x + 7 + 3]; + assert((end_off - start_off) <= 7); + } +#endif + + int filter_size = program->filter_size; + + const float* AVS_RESTRICT current_coeff; + __m256i one_epi32 = _mm256_set1_epi32(1); + + src_pitch = src_pitch / sizeof(float); + dst_pitch = dst_pitch / sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + for (int x = 0; x < width; x += 8) + { + // prepare coefs in transposed V-form + __m256 coef_0 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4); + __m256 coef_1 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5); + __m256 coef_2 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6); + __m256 coef_3 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7); + + _MM_TRANSPOSE8_LANE4_PS(coef_0, coef_1, coef_2, coef_3); + + // convert resampling program in H-form into permuting indexes for src transposition in V-form + int iStart = program->pixel_offset[x + 0]; + + __m256i perm_0 = _mm256_set_epi32( + program->pixel_offset[x + 7] - iStart, + program->pixel_offset[x + 6] - iStart, + program->pixel_offset[x + 5] - iStart, + program->pixel_offset[x + 4] - iStart, + program->pixel_offset[x + 3] - iStart, + program->pixel_offset[x + 2] - iStart, + program->pixel_offset[x + 1] - iStart, + 0); + __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32); + one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]); + __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32); + one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]); + __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32); + /* + one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 1] - program->pixel_offset[x + 0]); + __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32); + one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]); + __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32); + one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]); + __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32); + __m256i perm_1 = _mm256_set_epi32( + program->pixel_offset[x + 7] - iStart, + program->pixel_offset[x + 6] - iStart, + program->pixel_offset[x + 5] - iStart, + program->pixel_offset[x + 4] - iStart, + program->pixel_offset[x + 3] - iStart, + program->pixel_offset[x + 2] - iStart, + program->pixel_offset[x + 1] - iStart, + 0); + */ + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset + + for (int y = 0; y < height; y++) + { + __m256 data_src = _mm256_loadu_ps(src_ptr); + + __m256 data_0 = _mm256_permutevar8x32_ps(data_src, perm_0); + __m256 data_1 = _mm256_permutevar8x32_ps(data_src, perm_1); + __m256 data_2 = _mm256_permutevar8x32_ps(data_src, perm_2); + __m256 data_3 = _mm256_permutevar8x32_ps(data_src, perm_3); + + __m256 result0 = _mm256_mul_ps(data_0, coef_0); + __m256 result1 = _mm256_mul_ps(data_2, coef_2); + + result0 = _mm256_fmadd_ps(data_1, coef_1, result0); + result1 = _mm256_fmadd_ps(data_3, coef_3, result1); + + _mm256_store_ps(dst_ptr, _mm256_add_ps(result0, result1)); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } + current_coeff += filter_size * 8; + } +} + + +/* Universal function supporting 2 ways of processing depending on the max offset of the source samples to read in the resampling program : +1. For high upsampling ratios it uses low read (single 8 float source samples) and permute-transpose before V-fma +2. For downsample and no-resize convolution - use each input sequence gathering by direct addressing +*/ +template +void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) +{ + assert(filtersizemod4 >= 0 && filtersizemod4 <= 3); + + const int filter_size = program->filter_size; // aligned, practically the coeff table stride + + src_pitch /= sizeof(float); + dst_pitch /= sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + constexpr int PIXELS_AT_A_TIME = 8; // Process eight pixels in parallel using AVX2 (2x4 using m128 lanes) + + // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width. + // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back + // from the target width, as we load 4 floats at once with '_mm_loadu_ps'. + const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME; + + // Preconditions: + assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop. + + // 'target_size_alignment' ensures we can safely access coefficients using offsets like + // 'filter_size * 7' when processing 8 H pixels at a time or + // 'filter_size * 15' when processing 16 H pixels at a time + assert(program->target_size_alignment >= 8); + + // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads. + assert(program->filter_size_alignment >= 4); + + bool bDoGather = false; + // Analyse input resampling program to select method of processing + for (int x = 0; x < width - 8; x += 8) // -8 to save from vector overrread at program->pixel_offset[x + 7 + 3]; ? + { + int start_off = program->pixel_offset[x + 0]; + int end_off = program->pixel_offset[x + 7]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true; + + start_off = program->pixel_offset[x + 1]; + end_off = program->pixel_offset[x + 7 + 1]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true; + + start_off = program->pixel_offset[x + 2]; + end_off = program->pixel_offset[x + 7 + 2]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true; + + start_off = program->pixel_offset[x + 3]; + end_off = program->pixel_offset[x + 7 + 3]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true; + } + + if (bDoGather) + { + int x = 0; + + // This 'auto' lambda construct replaces the need of templates + auto do_h_float_core = [&](auto partial_load) { + // Load up to 2x4 coefficients at once before the height loop. + // Pre-loading and transposing coefficients keeps register usage efficient. + // Assumes 'filter_size_aligned' is at least 4. + + // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3] and for src_ptr + begin5 [0..3] ) + __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4); + __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5); + __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6); + __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7); + + _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src; + + // Pixel offsets for the current target x-positions. + // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'. + const int begin1 = program->pixel_offset[x + 0]; + const int begin2 = program->pixel_offset[x + 1]; + const int begin3 = program->pixel_offset[x + 2]; + const int begin4 = program->pixel_offset[x + 3]; + const int begin5 = program->pixel_offset[x + 4]; + const int begin6 = program->pixel_offset[x + 5]; + const int begin7 = program->pixel_offset[x + 6]; + const int begin8 = program->pixel_offset[x + 7]; + + for (int y = 0; y < height; y++) + { + __m256 data_1_data_5; + __m256 data_2_data_6; + __m256 data_3_data_7; + __m256 data_4_data_8; + + if constexpr (partial_load) { + // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function + // to prevent reading beyond the allocated source scanline. This handles cases where loading 4 floats + // starting from 'src_ptr + beginX' might exceed the source buffer. + + // Example of the unsafe scenario: If target width is 320, a naive load at src_ptr + 317 + // would attempt to read floats at indices 317, 318, 319, and 320, potentially going out of bounds. + + // Two main issues in the unsafe zone: + // 1.) Out-of-bounds memory access: Reading beyond the allocated memory for the source scanline can + // lead to access violations and crashes. '_mm_loadu_ps' attempts to load 16 bytes, so even if + // the starting address is within bounds, subsequent reads might not be. + // 2.) Garbage or NaN values: Even if a read doesn't cause a crash, accessing uninitialized or + // out-of-bounds memory (especially for float types) can result in garbage data, including NaN. + // Multiplying by a valid coefficient and accumulating this NaN can contaminate the final result. + + // '_mm256_load_partial_safe_2_m128' safely loads up to 'filter_size_real' pixels and pads with zeros if needed, + // preventing out-of-bounds reads and ensuring predictable results even near the image edges. + + data_1_data_5 = _mm256_load_partial_safe_2_m128(src_ptr + begin1, src_ptr + begin5); + data_2_data_6 = _mm256_load_partial_safe_2_m128(src_ptr + begin2, src_ptr + begin6); + data_3_data_7 = _mm256_load_partial_safe_2_m128(src_ptr + begin3, src_ptr + begin7); + data_4_data_8 = _mm256_load_partial_safe_2_m128(src_ptr + begin4, src_ptr + begin8); + } + else { + // In the safe zone, we can directly load 4 pixels at a time using unaligned loads. + data_1_data_5 = _mm256_loadu_2_m128(src_ptr + begin1, src_ptr + begin5); + data_2_data_6 = _mm256_loadu_2_m128(src_ptr + begin2, src_ptr + begin6); + data_3_data_7 = _mm256_loadu_2_m128(src_ptr + begin3, src_ptr + begin7); + data_4_data_8 = _mm256_loadu_2_m128(src_ptr + begin4, src_ptr + begin8); + } + + _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8); + + __m256 result = _mm256_mul_ps(data_1_data_5, coef_1_coef_5); + result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result); + result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result); + result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result); + + _mm256_store_ps(dst_ptr, result); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } // y + current_coeff += filter_size * 8; // Move to the next set of coefficients for the next 8 output pixels + }; // end of lambda + + // Process the 'safe zone' where direct full unaligned loads are acceptable. + for (; x < width_safe_mod; x += PIXELS_AT_A_TIME) + { + do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm_loadu_ps + } + + // Process the potentially 'unsafe zone' near the image edge, using safe loading. + for (; x < width; x += PIXELS_AT_A_TIME) + { + do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm256_load_partial_safe_2_m128' + } + } // if bDoGather + else + { + // do permutex-based upsample + for (int x = 0; x < width; x += 8) + { + // prepare coefs in transposed V-form + __m256 coef_0 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4); + __m256 coef_1 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5); + __m256 coef_2 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6); + __m256 coef_3 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7); + + _MM_TRANSPOSE8_LANE4_PS(coef_0, coef_1, coef_2, coef_3); + + // convert resampling program in H-form into permuting indexes for src transposition in V-form + int iStart = program->pixel_offset[x + 0]; + + __m256i perm_0 = _mm256_set_epi32( + program->pixel_offset[x + 7] - iStart, + program->pixel_offset[x + 6] - iStart, + program->pixel_offset[x + 5] - iStart, + program->pixel_offset[x + 4] - iStart, + program->pixel_offset[x + 3] - iStart, + program->pixel_offset[x + 2] - iStart, + program->pixel_offset[x + 1] - iStart, + 0); + __m256i one_epi32 = _mm256_set1_epi32(1); + __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32); + one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]); + __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32); + one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]); + __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32); + /* + one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 1] - program->pixel_offset[x + 0]); + __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32); + one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]); + __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32); + one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]); + __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32); + __m256i perm_1 = _mm256_set_epi32( + program->pixel_offset[x + 7] - iStart, + program->pixel_offset[x + 6] - iStart, + program->pixel_offset[x + 5] - iStart, + program->pixel_offset[x + 4] - iStart, + program->pixel_offset[x + 3] - iStart, + program->pixel_offset[x + 2] - iStart, + program->pixel_offset[x + 1] - iStart, + 0); + */ + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset + + for (int y = 0; y < height; y++) + { + __m256 data_src = _mm256_loadu_ps(src_ptr); + + __m256 data_0 = _mm256_permutevar8x32_ps(data_src, perm_0); + __m256 data_1 = _mm256_permutevar8x32_ps(data_src, perm_1); + __m256 data_2 = _mm256_permutevar8x32_ps(data_src, perm_2); + __m256 data_3 = _mm256_permutevar8x32_ps(data_src, perm_3); + + __m256 result0 = _mm256_mul_ps(data_0, coef_0); + __m256 result1 = _mm256_mul_ps(data_2, coef_2); + + result0 = _mm256_fmadd_ps(data_1, coef_1, result0); + result1 = _mm256_fmadd_ps(data_3, coef_3, result1); + + _mm256_store_ps(dst_ptr, _mm256_add_ps(result0, result1)); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } + current_coeff += filter_size * 8; + } + } +} + +// Instantiate them +template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + + +void resize_h_planar_float_avx2_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) +{ + + // assert - check if max pixel_offset is not above single load of 8 src floats (or need several loads and more complex permute program) +#ifdef _DEBUG + for (int x = 0; x < width; x += 8) + { + int start_off = program->pixel_offset[x + 0]; + int end_off = program->pixel_offset[x + 7]; + assert((end_off - start_off) > 7); + } +#endif + + int filter_size = program->filter_size; // must be 8 + assert(filter_size != 8); + + const float* AVS_RESTRICT current_coeff; + __m256i one_epi32 = _mm256_set1_epi32(1); + + src_pitch = src_pitch / sizeof(float); + dst_pitch = dst_pitch / sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + for (int x = 0; x < width; x += 8) + { + // prepare coefs in transposed V-form + __m256 coef_0 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4); + __m256 coef_1 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5); + __m256 coef_2 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6); + __m256 coef_3 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7); + + __m256 coef_4 = _mm256_load_2_m128(current_coeff + filter_size * 0 + 4, current_coeff + filter_size * 4 + 4); + __m256 coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 1 + 4, current_coeff + filter_size * 5 + 4); + __m256 coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 2 + 4, current_coeff + filter_size * 6 + 4); + __m256 coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 3 + 4, current_coeff + filter_size * 7 + 4); + + _MM_TRANSPOSE8_LANE4_PS(coef_0, coef_1, coef_2, coef_3); + _MM_TRANSPOSE8_LANE4_PS(coef_4, coef_5, coef_6, coef_7); + + // convert resampling program in H-form into permuting indexes for src transposition in V-form + int iStart = program->pixel_offset[x + 0]; + __m256i perm_0 = _mm256_set_epi32(program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0); + __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32); + __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32); + __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset + + for (int y = 0; y < height; y++) + { + __m256 result; + __m256 data_src = _mm256_loadu_ps(src_ptr); + + __m256 data_0 = _mm256_permutevar8x32_ps(data_src, perm_0); + __m256 data_1 = _mm256_permutevar8x32_ps(data_src, perm_1); + __m256 data_2 = _mm256_permutevar8x32_ps(data_src, perm_2); + __m256 data_3 = _mm256_permutevar8x32_ps(data_src, perm_3); + + __m256 result0 = _mm256_mul_ps(data_0, coef_0); + __m256 result1 = _mm256_mul_ps(data_2, coef_2); + + result0 = _mm256_fmadd_ps(data_1, coef_1, result0); + result1 = _mm256_fmadd_ps(data_3, coef_3, result1); + + result = _mm256_add_ps(result0, result1); + + // next next 4 samples + 4 coefs + data_src = _mm256_loadu_ps(src_ptr + 4); + + data_0 = _mm256_permutevar8x32_ps(data_src, perm_0); + data_1 = _mm256_permutevar8x32_ps(data_src, perm_1); + data_2 = _mm256_permutevar8x32_ps(data_src, perm_2); + data_3 = _mm256_permutevar8x32_ps(data_src, perm_3); + + result0 = _mm256_mul_ps(data_0, coef_4); + result1 = _mm256_mul_ps(data_2, coef_6); + + result0 = _mm256_fmadd_ps(data_1, coef_5, result0); + result1 = _mm256_fmadd_ps(data_3, coef_7, result1); + + result = _mm256_add_ps(result, result0); + result = _mm256_add_ps(result, result1); + + _mm256_store_ps(dst_ptr, result); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } + current_coeff += filter_size * 8; + } +} + +#if 0 +// Original DTL2020, made end-contition safe +// process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2 +void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) { + int filter_size = program->filter_size; + + const float* AVS_RESTRICT current_coeff; + + src_pitch = src_pitch / sizeof(float); + dst_pitch = dst_pitch / sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + for (int x = 0; x < width; x += 8) + { + __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4); + __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5); + __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6); + __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7); + + _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src; + + for (int y = 0; y < height; y++) + { + __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0], src_ptr + program->pixel_offset[x + 4]); + __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1], src_ptr + program->pixel_offset[x + 5]); + __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2], src_ptr + program->pixel_offset[x + 6]); + __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3], src_ptr + program->pixel_offset[x + 7]); + + _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8); + + __m256 result = _mm256_mul_ps(data_1_data_5, coef_1_coef_5); + result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result); + result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result); + result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result); + + _mm256_store_ps(dst_ptr, result); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } + current_coeff += filter_size * 8; + } + +} +#endif diff --git a/avs_core/filters/intel/resample_avx2.h b/avs_core/filters/intel/resample_avx2.h index a3b163aa3..8f4b41625 100644 --- a/avs_core/filters/intel/resample_avx2.h +++ b/avs_core/filters/intel/resample_avx2.h @@ -51,5 +51,75 @@ template void resize_v_avx2_planar_uint16_t(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); void resize_v_avx2_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); +void resize_v_avx2_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); + +void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + +template +void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + +void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + +template +void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + +// Transpose 4x4 blocks within each lane +#define _MM_TRANSPOSE8_LANE4_PS(row0, row1, row2, row3) \ + do { \ + __m256 __t0, __t1, __t2, __t3; \ + __t0 = _mm256_unpacklo_ps(row0, row1); \ + __t1 = _mm256_unpackhi_ps(row0, row1); \ + __t2 = _mm256_unpacklo_ps(row2, row3); \ + __t3 = _mm256_unpackhi_ps(row2, row3); \ + row0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \ + row1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \ + row2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \ + row3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \ + } while (0) + +#define _MM_TRANSPOSE8_PS(row0, row1, row2, row3, row4, row5, row6, row7) \ + do { \ + __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; \ + __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; \ + __t0 = _mm256_unpacklo_ps(row0, row1); \ + __t1 = _mm256_unpackhi_ps(row0, row1); \ + __t2 = _mm256_unpacklo_ps(row2, row3); \ + __t3 = _mm256_unpackhi_ps(row2, row3); \ + __t4 = _mm256_unpacklo_ps(row4, row5); \ + __t5 = _mm256_unpackhi_ps(row4, row5); \ + __t6 = _mm256_unpacklo_ps(row6, row7); \ + __t7 = _mm256_unpackhi_ps(row6, row7); \ + __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \ + __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \ + __tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \ + __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \ + __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); \ + __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); \ + __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); \ + __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); \ + row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); \ + row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); \ + row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); \ + row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); \ + row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); \ + row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); \ + row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); \ + row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); \ + } while (0) + + + +#ifndef _mm256_loadu_2_m128 +#define _mm256_loadu_2_m128(/* __m128 const* */ loaddr, \ + /* __m128 const* */ hiaddr) \ + _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr)) +#endif + +#ifndef _mm256_load_2_m128 +#define _mm256_load_2_m128(/* __m128 const* */ loaddr, \ + /* __m128 const* */ hiaddr) \ + _mm256_set_m128(_mm_load_ps(hiaddr), _mm_load_ps(loaddr)) +#endif + #endif // __Resample_AVX2_H__ diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp new file mode 100644 index 000000000..68e9cae78 --- /dev/null +++ b/avs_core/filters/intel/resample_avx512.cpp @@ -0,0 +1,2014 @@ +// Avisynth v2.5. Copyright 2002 Ben Rudiak-Gould et al. +// http://avisynth.nl + +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit +// http://www.gnu.org/copyleft/gpl.html . +// +// Linking Avisynth statically or dynamically with other modules is making a +// combined work based on Avisynth. Thus, the terms and conditions of the GNU +// General Public License cover the whole combination. +// +// As a special exception, the copyright holders of Avisynth give you +// permission to link Avisynth with independent modules that communicate with +// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license +// terms of these independent modules, and to copy and distribute the +// resulting combined work under terms of your choice, provided that +// every copy of the combined work is accompanied by a complete copy of +// the source code of Avisynth (the version of Avisynth used to produce the +// combined work), being distributed under the terms of the GNU General +// Public License plus this exception. An independent module is a module +// which is not derived from or based on Avisynth, such as 3rd-party filters, +// import and export plugins, or graphical user interfaces. + +#include +#include "../core/internal.h" + +#include +#include + +#include "resample_avx512.h" +//------- 512 bit float Horizontals + +// Safe quad lane partial load with AVX512 +// Read exactly N pixels (where N mod 4 is the template parameter), avoiding +// - reading beyond the end of the source buffer. +// - avoid NaN contamination by padding with zeros. +template +AVS_FORCEINLINE static __m512 _mm512_load_partial_safe_4_m128(const float* src_ptr_offsetted1, const float* src_ptr_offsetted2, const float* src_ptr_offsetted3, const float* src_ptr_offsetted4) { + __m128 s1, s2, s3, s4; + switch (Nmod4) { + case 1: + s1 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted1[0]); + s2 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted2[0]); + s3 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted3[0]); + s4 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted4[0]); + // ideally: movss + break; + case 2: + s1 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted1[1], src_ptr_offsetted1[0]); + s2 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted2[1], src_ptr_offsetted2[0]); + s3 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted3[1], src_ptr_offsetted3[0]); + s4 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted4[1], src_ptr_offsetted4[0]); + // ideally: movsd + break; + case 3: + s1 = _mm_set_ps(0.0f, src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]); + s2 = _mm_set_ps(0.0f, src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]); + s3 = _mm_set_ps(0.0f, src_ptr_offsetted3[2], src_ptr_offsetted3[1], src_ptr_offsetted3[0]); + s4 = _mm_set_ps(0.0f, src_ptr_offsetted4[2], src_ptr_offsetted4[1], src_ptr_offsetted4[0]); + // ideally: movss + movsd + shuffle or movsd + insert + break; + case 0: + s1 = _mm_set_ps(src_ptr_offsetted1[3], src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]); + s2 = _mm_set_ps(src_ptr_offsetted2[3], src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]); + s3 = _mm_set_ps(src_ptr_offsetted3[3], src_ptr_offsetted3[2], src_ptr_offsetted3[1], src_ptr_offsetted3[0]); + s4 = _mm_set_ps(src_ptr_offsetted4[3], src_ptr_offsetted4[2], src_ptr_offsetted4[1], src_ptr_offsetted4[0]); + // ideally: movups + break; + default: + s1 = _mm_setzero_ps(); // n/a cannot happen + s2 = _mm_setzero_ps(); + s3 = _mm_setzero_ps(); + s4 = _mm_setzero_ps(); + } + __m512 result = _mm512_castps128_ps512(s1); // Cast the first __m128 to __m512 + result = _mm512_insertf32x4(result, s2, 1); // Insert the second __m128 at position 1 + result = _mm512_insertf32x4(result, s3, 2); // Insert the third __m128 at position 2 + result = _mm512_insertf32x4(result, s4, 3); // Insert the fourth __m128 at position 3 + return result; +} + + + + +// Processes a horizontal resampling kernel of up to four coefficients for float pixel types. +// Supports BilinearResize, BicubicResize, or sinc with up to 2 taps (filter size <= 4). +// AVX512 optimization loads and processes four float coefficients and sixteen pixels simultaneously. +// The 'filtersizemod4' template parameter (0-3) helps optimize for different filter sizes modulo 4. +// This AVX512 requires only filter_size_alignment of 4. +template +void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) { + assert(filtersizemod4 >= 0 && filtersizemod4 <= 3); + + const int filter_size = program->filter_size; // aligned, practically the coeff table stride + + src_pitch /= sizeof(float); + dst_pitch /= sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + constexpr int PIXELS_AT_A_TIME = 16; // Process sixteen pixels in parallel using AVX512 (4x4 using m128 lanes) + + // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width. + // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back + // from the target width, as we load 4 floats at once conceptually with our safe load. + const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME; + + // Preconditions: + assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop. + + // 'target_size_alignment' ensures we can safely access coefficients using offsets like + // 'filter_size * 7' when processing 8 H pixels at a time or + // 'filter_size * 15' when processing 16 H pixels at a time + assert(program->target_size_alignment >= 16); // Adjusted for 16 pixels + assert(FRAME_ALIGN >= 64); // Adjusted for 16 pixels AviSynth+ default + + // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads. + assert(program->filter_size_alignment >= 4); + + int x = 0; + + // This 'auto' lambda construct replaces the need of templates + auto do_h_float_core = [&](auto partial_load) { + // Load up to 4x4 coefficients at once before the height loop. + // Pre-loading and transposing coefficients keeps register usage efficient. + // Assumes 'filter_size_aligned' is at least 4. + + // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3]) + __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12); + __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13); + __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14); + __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15); + + _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src; + + // Pixel offsets for the current target x-positions. + // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'. + const int begin1 = program->pixel_offset[x + 0]; + const int begin2 = program->pixel_offset[x + 1]; + const int begin3 = program->pixel_offset[x + 2]; + const int begin4 = program->pixel_offset[x + 3]; + const int begin5 = program->pixel_offset[x + 4]; + const int begin6 = program->pixel_offset[x + 5]; + const int begin7 = program->pixel_offset[x + 6]; + const int begin8 = program->pixel_offset[x + 7]; + const int begin9 = program->pixel_offset[x + 8]; + const int begin10 = program->pixel_offset[x + 9]; + const int begin11 = program->pixel_offset[x + 10]; + const int begin12 = program->pixel_offset[x + 11]; + const int begin13 = program->pixel_offset[x + 12]; + const int begin14 = program->pixel_offset[x + 13]; + const int begin15 = program->pixel_offset[x + 14]; + const int begin16 = program->pixel_offset[x + 15]; + + for (int y = 0; y < height; y++) + { + __m512 data_1_5_9_13; + __m512 data_2_6_10_14; + __m512 data_3_7_11_15; + __m512 data_4_8_12_16; + + if constexpr (partial_load) { + // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function + // to prevent reading beyond the allocated source scanline. + + data_1_5_9_13 = _mm512_load_partial_safe_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13); + data_2_6_10_14 = _mm512_load_partial_safe_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14); + data_3_7_11_15 = _mm512_load_partial_safe_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15); + data_4_8_12_16 = _mm512_load_partial_safe_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16); + } + else { + // In the safe zone, we can directly load 4 pixels at a time for each of the four lanes. + data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13); + data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14); + data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15); + data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16); + } + + _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16); + + __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13); + result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result); + result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result); + result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result); + + _mm512_store_ps(dst_ptr, result); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } // y + current_coeff += filter_size * 16; // Move to the next set of coefficients for the next 16 output pixels + }; // end of lambda + + // Process the 'safe zone' where direct full unaligned loads are acceptable. + for (; x < width_safe_mod; x += PIXELS_AT_A_TIME) + { + do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128 + } + + // Process the potentially 'unsafe zone' near the image edge, using safe loading. + for (; x < width; x += PIXELS_AT_A_TIME) + { + do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm512_load_partial_safe_4_m128' + } +} + +// Instantiate them +template void resize_h_planar_float_avx512_transpose_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx512_transpose_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx512_transpose_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx512_transpose_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + + +/* Universal function supporting 2 ways of processing depending on the max offset of the source samples to read in the resampling program : +1. For high upsampling ratios it uses low read (single 8 float source samples) and permute-transpose before V-fma +2. For downsample and no-resize convolution - use each input sequence gathering by direct addressing +*/ +template +void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) +{ + assert(filtersizemod4 >= 0 && filtersizemod4 <= 3); + + const int filter_size = program->filter_size; // aligned, practically the coeff table stride + + src_pitch /= sizeof(float); + dst_pitch /= sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + constexpr int PIXELS_AT_A_TIME = 16; // Process sixteen pixels in parallel using AVX512 (4x4 using m128 lanes) + + // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width. + // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back + // from the target width, as we load 4 floats at once conceptually with our safe load. + const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME; + + // Preconditions: + assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop. + + // 'target_size_alignment' ensures we can safely access coefficients using offsets like + // 'filter_size * 7' when processing 8 H pixels at a time or + // 'filter_size * 15' when processing 16 H pixels at a time + assert(program->target_size_alignment >= 16); // Adjusted for 16 pixels + assert(FRAME_ALIGN >= 64); // Adjusted for 16 pixels AviSynth+ default + + // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads. + assert(program->filter_size_alignment >= 4); + + bool bDoGather = false; + // Analyse input resampling program to select method of processing + for (int x = 0; x < width - 16; x += 16) // -16 to save from vector overrread at program->pixel_offset[x + 15 + 3]; ? + { + int start_off = program->pixel_offset[x + 0]; + int end_off = program->pixel_offset[x + 15]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true; + + start_off = program->pixel_offset[x + 1]; + end_off = program->pixel_offset[x + 15 + 1]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true; + + start_off = program->pixel_offset[x + 2]; + end_off = program->pixel_offset[x + 15 + 2]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true; + + start_off = program->pixel_offset[x + 3]; + end_off = program->pixel_offset[x + 15 + 3]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true; + } + + int x = 0; + + if (bDoGather) + { + // This 'auto' lambda construct replaces the need of templates + auto do_h_float_core = [&](auto partial_load) { + // Load up to 4x4 coefficients at once before the height loop. + // Pre-loading and transposing coefficients keeps register usage efficient. + // Assumes 'filter_size_aligned' is at least 4. + + // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3]) + __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12); + __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13); + __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14); + __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15); + + _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src; + + // Pixel offsets for the current target x-positions. + // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'. + const int begin1 = program->pixel_offset[x + 0]; + const int begin2 = program->pixel_offset[x + 1]; + const int begin3 = program->pixel_offset[x + 2]; + const int begin4 = program->pixel_offset[x + 3]; + const int begin5 = program->pixel_offset[x + 4]; + const int begin6 = program->pixel_offset[x + 5]; + const int begin7 = program->pixel_offset[x + 6]; + const int begin8 = program->pixel_offset[x + 7]; + const int begin9 = program->pixel_offset[x + 8]; + const int begin10 = program->pixel_offset[x + 9]; + const int begin11 = program->pixel_offset[x + 10]; + const int begin12 = program->pixel_offset[x + 11]; + const int begin13 = program->pixel_offset[x + 12]; + const int begin14 = program->pixel_offset[x + 13]; + const int begin15 = program->pixel_offset[x + 14]; + const int begin16 = program->pixel_offset[x + 15]; + + for (int y = 0; y < height; y++) + { + __m512 data_1_5_9_13; + __m512 data_2_6_10_14; + __m512 data_3_7_11_15; + __m512 data_4_8_12_16; + + if constexpr (partial_load) { + // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function + // to prevent reading beyond the allocated source scanline. + + data_1_5_9_13 = _mm512_load_partial_safe_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13); + data_2_6_10_14 = _mm512_load_partial_safe_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14); + data_3_7_11_15 = _mm512_load_partial_safe_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15); + data_4_8_12_16 = _mm512_load_partial_safe_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16); + } + else { + // In the safe zone, we can directly load 4 pixels at a time for each of the four lanes. + data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13); + data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14); + data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15); + data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16); + } + + _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16); + + __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13); + result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result); + result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result); + result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result); + + _mm512_store_ps(dst_ptr, result); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } // y + current_coeff += filter_size * 16; // Move to the next set of coefficients for the next 16 output pixels + }; // end of lambda + + // Process the 'safe zone' where direct full unaligned loads are acceptable. + for (; x < width_safe_mod; x += PIXELS_AT_A_TIME) + { + do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128 + } + + // Process the potentially 'unsafe zone' near the image edge, using safe loading. + for (; x < width; x += PIXELS_AT_A_TIME) + { + do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm512_load_partial_safe_4_m128' + } + } + else // if(bDoGather) + { + for (int x = 0; x < width; x += 16) + { + // prepare coefs in transposed V-form + __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12); + __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13); + __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14); + __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15); + + _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3); + + // convert resampling program in H-form into permuting indexes for src transposition in V-form + int iStart = program->pixel_offset[x + 0]; + + __m512i perm_0 = _mm512_set_epi32( + program->pixel_offset[x + 15] - iStart, + program->pixel_offset[x + 14] - iStart, + program->pixel_offset[x + 13] - iStart, + program->pixel_offset[x + 12] - iStart, + program->pixel_offset[x + 11] - iStart, + program->pixel_offset[x + 10] - iStart, + program->pixel_offset[x + 9] - iStart, + program->pixel_offset[x + 8] - iStart, + program->pixel_offset[x + 7] - iStart, + program->pixel_offset[x + 6] - iStart, + program->pixel_offset[x + 5] - iStart, + program->pixel_offset[x + 4] - iStart, + program->pixel_offset[x + 3] - iStart, + program->pixel_offset[x + 2] - iStart, + program->pixel_offset[x + 1] - iStart, + 0); + + __m512i one_epi32 = _mm512_set1_epi32(1); + __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32); + one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]); + __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32); + one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]); + __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset + + for (int y = 0; y < height; y++) // single row proc + { + __m512 data_src = _mm512_loadu_ps(src_ptr); + __m512 data_src2 = _mm512_loadu_ps(src_ptr + 16); // not always needed for upscale also can cause end of buffer overread - need to add limitation (special end of buffer processing ?) + + __m512 data_0 = _mm512_permutex2var_ps(data_src, perm_0, data_src2); + __m512 data_1 = _mm512_permutex2var_ps(data_src, perm_1, data_src2); + __m512 data_2 = _mm512_permutex2var_ps(data_src, perm_2, data_src2); + __m512 data_3 = _mm512_permutex2var_ps(data_src, perm_3, data_src2); + + __m512 result0 = _mm512_mul_ps(data_0, coef_r0); + __m512 result1 = _mm512_mul_ps(data_2, coef_r2); + + result0 = _mm512_fmadd_ps(data_1, coef_r1, result0); + result1 = _mm512_fmadd_ps(data_3, coef_r3, result1); + + _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } + + current_coeff += filter_size * 16; + } + } +} + +template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + + +/* Universal function supporting 2 ways of processing depending on the max offset of the source samples to read in the resampling program : +1. For high upsampling ratios it uses low read (single 8 float source samples) and permute-transpose before V-fma +2. For downsample and no-resize convolution - use each input sequence gathering by direct addressing +*/ +template +void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) +{ + assert(filtersizemod4 >= 0 && filtersizemod4 <= 3); + + const int filter_size = program->filter_size; // aligned, practically the coeff table stride + + src_pitch /= sizeof(float); + dst_pitch /= sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + const int width_mod32 = (width / 32) * 32; // Process by 2x 512it (2 x 16 floats) to make memory read/write linear streams longer, + + constexpr int MAX_PIXELS_AT_A_TIME = 32; // Process sixteen pixels in parallel using AVX512 (4x4 using m128 lanes) + constexpr int PIXELS_AT_A_TIME = 16; // Process sixteen pixels in parallel using AVX512 (4x4 using m128 lanes) + + // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width. + // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back + // from the target width, as we load 4 floats at once conceptually with our safe load. + const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / MAX_PIXELS_AT_A_TIME * MAX_PIXELS_AT_A_TIME; + + // Preconditions: + assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop. + + // 'target_size_alignment' ensures we can safely access coefficients using offsets like + // 'filter_size * 7' when processing 8 H pixels at a time or + // 'filter_size * 15' when processing 16 H pixels at a time + assert(program->target_size_alignment >= 16); // Adjusted for 16 pixels + assert(FRAME_ALIGN >= 64); // Adjusted for 16 pixels AviSynth+ default + + // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads. + assert(program->filter_size_alignment >= 4); + + bool bDoGather = false; + // Analyse input resampling program to select method of processing + for (int x = 0; x < width - 16; x += 16) // -16 to save from vector overrread at program->pixel_offset[x + 15 + 3]; ? + { + int start_off = program->pixel_offset[x + 0]; + int end_off = program->pixel_offset[x + 15]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true; + + start_off = program->pixel_offset[x + 1]; + end_off = program->pixel_offset[x + 15 + 1]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true; + + start_off = program->pixel_offset[x + 2]; + end_off = program->pixel_offset[x + 15 + 2]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true; + + start_off = program->pixel_offset[x + 3]; + end_off = program->pixel_offset[x + 15 + 3]; + if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true; + } + + int x = 0; + + if (bDoGather) + { + // This 'auto' lambda construct replaces the need of templates + auto do_h_float_core_16 = [&](auto partial_load) { + // Load up to 4x4 coefficients at once before the height loop. + // Pre-loading and transposing coefficients keeps register usage efficient. + // Assumes 'filter_size_aligned' is at least 4. + + // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3]) + __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12); + __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13); + __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14); + __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15); + + _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src; + + // Pixel offsets for the current target x-positions. + // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'. + const int begin1 = program->pixel_offset[x + 0]; + const int begin2 = program->pixel_offset[x + 1]; + const int begin3 = program->pixel_offset[x + 2]; + const int begin4 = program->pixel_offset[x + 3]; + const int begin5 = program->pixel_offset[x + 4]; + const int begin6 = program->pixel_offset[x + 5]; + const int begin7 = program->pixel_offset[x + 6]; + const int begin8 = program->pixel_offset[x + 7]; + const int begin9 = program->pixel_offset[x + 8]; + const int begin10 = program->pixel_offset[x + 9]; + const int begin11 = program->pixel_offset[x + 10]; + const int begin12 = program->pixel_offset[x + 11]; + const int begin13 = program->pixel_offset[x + 12]; + const int begin14 = program->pixel_offset[x + 13]; + const int begin15 = program->pixel_offset[x + 14]; + const int begin16 = program->pixel_offset[x + 15]; + + for (int y = 0; y < height; y++) + { + __m512 data_1_5_9_13; + __m512 data_2_6_10_14; + __m512 data_3_7_11_15; + __m512 data_4_8_12_16; + + if constexpr (partial_load) { + // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function + // to prevent reading beyond the allocated source scanline. + + data_1_5_9_13 = _mm512_load_partial_safe_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13); + data_2_6_10_14 = _mm512_load_partial_safe_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14); + data_3_7_11_15 = _mm512_load_partial_safe_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15); + data_4_8_12_16 = _mm512_load_partial_safe_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16); + } + else { + // In the safe zone, we can directly load 4 pixels at a time for each of the four lanes. + data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13); + data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14); + data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15); + data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16); + } + + _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16); + + __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13); + result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result); + result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result); + result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result); + + _mm512_store_ps(dst_ptr, result); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } // y + current_coeff += filter_size * 16; // Move to the next set of coefficients for the next 16 output pixels + }; // end of lambda_16 + + // This 'auto' lambda construct replaces the need of templates + auto do_h_float_core_32 = [&](auto partial_load) { + // Load up to 4x4 coefficients at once before the height loop. + // Pre-loading and transposing coefficients keeps register usage efficient. + // Assumes 'filter_size_aligned' is at least 4. + + // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3]) + __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12); + __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13); + __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14); + __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15); + + _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16); + + // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3]) + __m512 coef_1_5_9_13_2 = _mm512_load_4_m128(current_coeff + filter_size * 16, current_coeff + filter_size * 20, current_coeff + filter_size * 24, current_coeff + filter_size * 28); + __m512 coef_2_6_10_14_2 = _mm512_load_4_m128(current_coeff + filter_size * 17, current_coeff + filter_size * 21, current_coeff + filter_size * 25, current_coeff + filter_size * 29); + __m512 coef_3_7_11_15_2 = _mm512_load_4_m128(current_coeff + filter_size * 18, current_coeff + filter_size * 22, current_coeff + filter_size * 26, current_coeff + filter_size * 30); + __m512 coef_4_8_12_16_2 = _mm512_load_4_m128(current_coeff + filter_size * 19, current_coeff + filter_size * 23, current_coeff + filter_size * 27, current_coeff + filter_size * 31); + + _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13_2, coef_2_6_10_14_2, coef_3_7_11_15_2, coef_4_8_12_16_2); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src; + + // Pixel offsets for the current target x-positions. + // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'. + const int begin1 = program->pixel_offset[x + 0]; + const int begin2 = program->pixel_offset[x + 1]; + const int begin3 = program->pixel_offset[x + 2]; + const int begin4 = program->pixel_offset[x + 3]; + const int begin5 = program->pixel_offset[x + 4]; + const int begin6 = program->pixel_offset[x + 5]; + const int begin7 = program->pixel_offset[x + 6]; + const int begin8 = program->pixel_offset[x + 7]; + const int begin9 = program->pixel_offset[x + 8]; + const int begin10 = program->pixel_offset[x + 9]; + const int begin11 = program->pixel_offset[x + 10]; + const int begin12 = program->pixel_offset[x + 11]; + const int begin13 = program->pixel_offset[x + 12]; + const int begin14 = program->pixel_offset[x + 13]; + const int begin15 = program->pixel_offset[x + 14]; + const int begin16 = program->pixel_offset[x + 15]; + + // Pixel offsets for the current target x-positions. + // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'. + const int begin1_2 = program->pixel_offset[x + 16]; + const int begin2_2 = program->pixel_offset[x + 17]; + const int begin3_2 = program->pixel_offset[x + 18]; + const int begin4_2 = program->pixel_offset[x + 19]; + const int begin5_2 = program->pixel_offset[x + 20]; + const int begin6_2 = program->pixel_offset[x + 21]; + const int begin7_2 = program->pixel_offset[x + 22]; + const int begin8_2 = program->pixel_offset[x + 23]; + const int begin9_2 = program->pixel_offset[x + 24]; + const int begin10_2 = program->pixel_offset[x + 25]; + const int begin11_2 = program->pixel_offset[x + 26]; + const int begin12_2 = program->pixel_offset[x + 27]; + const int begin13_2 = program->pixel_offset[x + 28]; + const int begin14_2 = program->pixel_offset[x + 29]; + const int begin15_2 = program->pixel_offset[x + 30]; + const int begin16_2 = program->pixel_offset[x + 31]; + + for (int y = 0; y < height; y++) + { + __m512 data_1_5_9_13; + __m512 data_2_6_10_14; + __m512 data_3_7_11_15; + __m512 data_4_8_12_16; + + __m512 data_1_5_9_13_2; + __m512 data_2_6_10_14_2; + __m512 data_3_7_11_15_2; + __m512 data_4_8_12_16_2; + + if constexpr (partial_load) { + // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function + // to prevent reading beyond the allocated source scanline. + + data_1_5_9_13 = _mm512_load_partial_safe_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13); + data_2_6_10_14 = _mm512_load_partial_safe_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14); + data_3_7_11_15 = _mm512_load_partial_safe_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15); + data_4_8_12_16 = _mm512_load_partial_safe_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16); + + data_1_5_9_13_2 = _mm512_load_partial_safe_4_m128(src_ptr + begin1_2, src_ptr + begin5_2, src_ptr + begin9_2, src_ptr + begin13_2); + data_2_6_10_14_2 = _mm512_load_partial_safe_4_m128(src_ptr + begin2_2, src_ptr + begin6_2, src_ptr + begin10_2, src_ptr + begin14_2); + data_3_7_11_15_2 = _mm512_load_partial_safe_4_m128(src_ptr + begin3_2, src_ptr + begin7_2, src_ptr + begin11_2, src_ptr + begin15_2); + data_4_8_12_16_2 = _mm512_load_partial_safe_4_m128(src_ptr + begin4_2, src_ptr + begin8_2, src_ptr + begin12_2, src_ptr + begin16_2); + + } + else { + // In the safe zone, we can directly load 4 pixels at a time for each of the four lanes. + data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13); + data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14); + data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15); + data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16); + + data_1_5_9_13_2 = _mm512_loadu_4_m128(src_ptr + begin1_2, src_ptr + begin5_2, src_ptr + begin9_2, src_ptr + begin13_2); + data_2_6_10_14_2 = _mm512_loadu_4_m128(src_ptr + begin2_2, src_ptr + begin6_2, src_ptr + begin10_2, src_ptr + begin14_2); + data_3_7_11_15_2 = _mm512_loadu_4_m128(src_ptr + begin3_2, src_ptr + begin7_2, src_ptr + begin11_2, src_ptr + begin15_2); + data_4_8_12_16_2 = _mm512_loadu_4_m128(src_ptr + begin4_2, src_ptr + begin8_2, src_ptr + begin12_2, src_ptr + begin16_2); + + } + + _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16); + _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13_2, data_2_6_10_14_2, data_3_7_11_15_2, data_4_8_12_16_2); + + __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13); + result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result); + result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result); + result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result); + + __m512 result_2 = _mm512_mul_ps(data_1_5_9_13_2, coef_1_5_9_13_2); + result_2 = _mm512_fmadd_ps(data_2_6_10_14_2, coef_2_6_10_14_2, result_2); + result_2 = _mm512_fmadd_ps(data_3_7_11_15_2, coef_3_7_11_15_2, result_2); + result_2 = _mm512_fmadd_ps(data_4_8_12_16_2, coef_4_8_12_16_2, result_2); + + + _mm512_store_ps(dst_ptr, result); + _mm512_store_ps(dst_ptr + 16, result_2); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } // y + current_coeff += filter_size * 32; // Move to the next set of coefficients for the next 32 output pixels + }; // end of lambda + + // Process the 'safe zone' where direct full unaligned loads are acceptable. + for (; x < std::min(width_mod32, width_safe_mod); x += 32) + { + do_h_float_core_32(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128 + } + + for (width_mod32; x < width_safe_mod; x += PIXELS_AT_A_TIME) + { + do_h_float_core_16(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128 + } + + // Process the potentially 'unsafe zone' near the image edge, using safe loading. + for (; x < width; x += PIXELS_AT_A_TIME) + { + do_h_float_core_16(std::true_type{}); // partial_load == true, use the safer '_mm512_load_partial_safe_4_m128' + } + } + else // if(bDoGather) + { + for (int x = 0; x < width_mod32; x += 32) + { + // prepare coefs in transposed V-form + __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12); + __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13); + __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14); + __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15); + + _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3); + + __m512 coef_r0_2 = _mm512_load_4_m128(current_coeff + filter_size * 16, current_coeff + filter_size * 20, current_coeff + filter_size * 24, current_coeff + filter_size * 28); + __m512 coef_r1_2 = _mm512_load_4_m128(current_coeff + filter_size * 17, current_coeff + filter_size * 21, current_coeff + filter_size * 25, current_coeff + filter_size * 29); + __m512 coef_r2_2 = _mm512_load_4_m128(current_coeff + filter_size * 18, current_coeff + filter_size * 22, current_coeff + filter_size * 26, current_coeff + filter_size * 30); + __m512 coef_r3_2 = _mm512_load_4_m128(current_coeff + filter_size * 19, current_coeff + filter_size * 23, current_coeff + filter_size * 27, current_coeff + filter_size * 31); + + _MM_TRANSPOSE16_LANE4_PS(coef_r0_2, coef_r1_2, coef_r2_2, coef_r3_2); + + // convert resampling program in H-form into permuting indexes for src transposition in V-form + int iStart = program->pixel_offset[x + 0]; + + __m512i perm_0 = _mm512_set_epi32( + program->pixel_offset[x + 15] - iStart, + program->pixel_offset[x + 14] - iStart, + program->pixel_offset[x + 13] - iStart, + program->pixel_offset[x + 12] - iStart, + program->pixel_offset[x + 11] - iStart, + program->pixel_offset[x + 10] - iStart, + program->pixel_offset[x + 9] - iStart, + program->pixel_offset[x + 8] - iStart, + program->pixel_offset[x + 7] - iStart, + program->pixel_offset[x + 6] - iStart, + program->pixel_offset[x + 5] - iStart, + program->pixel_offset[x + 4] - iStart, + program->pixel_offset[x + 3] - iStart, + program->pixel_offset[x + 2] - iStart, + program->pixel_offset[x + 1] - iStart, + 0); + + __m512i one_epi32 = _mm512_set1_epi32(1); + __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32); + one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]); + __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32); + one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]); + __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32); + + // second gropup + __m512i perm_0_2 = _mm512_set_epi32( + program->pixel_offset[x + 31] - iStart, + program->pixel_offset[x + 30] - iStart, + program->pixel_offset[x + 29] - iStart, + program->pixel_offset[x + 28] - iStart, + program->pixel_offset[x + 27] - iStart, + program->pixel_offset[x + 26] - iStart, + program->pixel_offset[x + 25] - iStart, + program->pixel_offset[x + 24] - iStart, + program->pixel_offset[x + 23] - iStart, + program->pixel_offset[x + 22] - iStart, + program->pixel_offset[x + 21] - iStart, + program->pixel_offset[x + 20] - iStart, + program->pixel_offset[x + 19] - iStart, + program->pixel_offset[x + 18] - iStart, + program->pixel_offset[x + 17] - iStart, + program->pixel_offset[x + 16] - iStart); + + + __m512i perm_1_2 = _mm512_add_epi32(perm_0_2, one_epi32); + one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]); + __m512i perm_2_2 = _mm512_add_epi32(perm_1_2, one_epi32); + one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]); + __m512i perm_3_2 = _mm512_add_epi32(perm_2_2, one_epi32); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset + const float* src_ptr2 = src + program->pixel_offset[x + 16]; // all permute offsets relative to this start offset + + for (int y = 0; y < height; y++) // single row proc + { + __m512 data_src = _mm512_loadu_ps(src_ptr); + __m512 data_src2 = _mm512_loadu_ps(src_ptr + 16); // not always needed for upscale also can cause end of buffer overread - need to add limitation (special end of buffer processing ?) + + __m512 data_src_2 = _mm512_loadu_ps(src_ptr2); + __m512 data_src2_2 = _mm512_loadu_ps(src_ptr2 + 16); // not always needed for upscale also can cause end of buffer overread - need to add limitation (special end of buffer processing ?) + + __m512 data_0 = _mm512_permutex2var_ps(data_src, perm_0, data_src2); + __m512 data_1 = _mm512_permutex2var_ps(data_src, perm_1, data_src2); + __m512 data_2 = _mm512_permutex2var_ps(data_src, perm_2, data_src2); + __m512 data_3 = _mm512_permutex2var_ps(data_src, perm_3, data_src2); + + __m512 data_0_2 = _mm512_permutex2var_ps(data_src_2, perm_0_2, data_src2_2); + __m512 data_1_2 = _mm512_permutex2var_ps(data_src_2, perm_1_2, data_src2_2); + __m512 data_2_2 = _mm512_permutex2var_ps(data_src_2, perm_2_2, data_src2_2); + __m512 data_3_2 = _mm512_permutex2var_ps(data_src_2, perm_3_2, data_src2_2); + + __m512 result0 = _mm512_mul_ps(data_0, coef_r0); + __m512 result1 = _mm512_mul_ps(data_2, coef_r2); + + __m512 result0_2 = _mm512_mul_ps(data_0_2, coef_r0_2); + __m512 result1_2 = _mm512_mul_ps(data_2_2, coef_r2_2); + + result0 = _mm512_fmadd_ps(data_1, coef_r1, result0); + result1 = _mm512_fmadd_ps(data_3, coef_r3, result1); + + result0_2 = _mm512_fmadd_ps(data_1_2, coef_r1_2, result0_2); + result1_2 = _mm512_fmadd_ps(data_3_2, coef_r3_2, result1_2); + + + _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); + _mm512_store_ps(dst_ptr + 16, _mm512_add_ps(result0_2, result1_2)); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } + + current_coeff += filter_size * 32; + } // to width_mo32 + + for (int x = width_mod32; x < width; x += 16) + { + // prepare coefs in transposed V-form + __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12); + __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13); + __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14); + __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15); + + _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3); + + // convert resampling program in H-form into permuting indexes for src transposition in V-form + int iStart = program->pixel_offset[x + 0]; + + __m512i perm_0 = _mm512_set_epi32( + program->pixel_offset[x + 15] - iStart, + program->pixel_offset[x + 14] - iStart, + program->pixel_offset[x + 13] - iStart, + program->pixel_offset[x + 12] - iStart, + program->pixel_offset[x + 11] - iStart, + program->pixel_offset[x + 10] - iStart, + program->pixel_offset[x + 9] - iStart, + program->pixel_offset[x + 8] - iStart, + program->pixel_offset[x + 7] - iStart, + program->pixel_offset[x + 6] - iStart, + program->pixel_offset[x + 5] - iStart, + program->pixel_offset[x + 4] - iStart, + program->pixel_offset[x + 3] - iStart, + program->pixel_offset[x + 2] - iStart, + program->pixel_offset[x + 1] - iStart, + 0); + + __m512i one_epi32 = _mm512_set1_epi32(1); + __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32); + one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]); + __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32); + one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]); + __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset + + for (int y = 0; y < height; y++) // single row proc + { + __m512 data_src = _mm512_loadu_ps(src_ptr); + __m512 data_src2 = _mm512_loadu_ps(src_ptr + 16); // not always needed for upscale also can cause end of buffer overread - need to add limitation (special end of buffer processing ?) + + __m512 data_0 = _mm512_permutex2var_ps(data_src, perm_0, data_src2); + __m512 data_1 = _mm512_permutex2var_ps(data_src, perm_1, data_src2); + __m512 data_2 = _mm512_permutex2var_ps(data_src, perm_2, data_src2); + __m512 data_3 = _mm512_permutex2var_ps(data_src, perm_3, data_src2); + + __m512 result0 = _mm512_mul_ps(data_0, coef_r0); + __m512 result1 = _mm512_mul_ps(data_2, coef_r2); + + result0 = _mm512_fmadd_ps(data_1, coef_r1, result0); + result1 = _mm512_fmadd_ps(data_3, coef_r3, result1); + + _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } + + current_coeff += filter_size * 16; + } // to width + } +} + +template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + + +#if 0 // DTL version +// Transpose-based +// process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2 +void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) { + int filter_size = program->filter_size; + + const float* AVS_RESTRICT current_coeff; + + src_pitch = src_pitch / sizeof(float); + dst_pitch = dst_pitch / sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + // this 16xfloat works, since AviSynth aligns scanlines to 64 bytes. + for (int x = 0; x < width; x += 16) // is it safe to read by 16 floats = 64 bytes ? + { + __m512 c1_c5_c9_c13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12); + __m512 c2_c6_c10_c14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13); + __m512 c3_c7_c11_c15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14); + __m512 c4_c8_c12_c16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15); + + _MM_TRANSPOSE16_LANE4_PS(c1_c5_c9_c13, c2_c6_c10_c14, c3_c7_c11_c15, c4_c8_c12_c16); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src; + + for (int y = 0; y < height; y++) + { + __m512 d1_d5_d9_d13 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 0], src_ptr + program->pixel_offset[x + 4], src_ptr + program->pixel_offset[x + 8], src_ptr + program->pixel_offset[x + 12]); + __m512 d2_d6_d10_d14 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 1], src_ptr + program->pixel_offset[x + 5], src_ptr + program->pixel_offset[x + 9], src_ptr + program->pixel_offset[x + 13]); + __m512 d3_d7_d11_d15 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 2], src_ptr + program->pixel_offset[x + 6], src_ptr + program->pixel_offset[x + 10], src_ptr + program->pixel_offset[x + 14]); + __m512 d4_d8_d12_d16 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 3], src_ptr + program->pixel_offset[x + 7], src_ptr + program->pixel_offset[x + 11], src_ptr + program->pixel_offset[x + 15]); + + _MM_TRANSPOSE16_LANE4_PS(d1_d5_d9_d13, d2_d6_d10_d14, d3_d7_d11_d15, d4_d8_d12_d16); + + __m512 result = _mm512_mul_ps(d1_d5_d9_d13, c1_c5_c9_c13); + result = _mm512_fmadd_ps(d2_d6_d10_d14, c2_c6_c10_c14, result); + result = _mm512_fmadd_ps(d3_d7_d11_d15, c3_c7_c11_c15, result); + result = _mm512_fmadd_ps(d4_d8_d12_d16, c4_c8_c12_c16, result); + + _mm512_store_ps(dst_ptr, result); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } + current_coeff += filter_size * 16; + } + +} +#endif + +#if 0 +void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) +{ + + // assert - check if max pixel_offset is not above single load of 16 src floats (or need several loads and more complex permute program) + +#ifdef _DEBUG + for (int x = 0; x < width; x += 16) + { + int start_off = program->pixel_offset[x + 0]; + int end_off = program->pixel_offset[x + 15]; + assert((end_off - start_off) > 15); + } +#endif + + int filter_size = program->filter_size; + + const float* AVS_RESTRICT current_coeff; + __m512i one_epi32 = _mm512_set1_epi32(1); + + src_pitch = src_pitch / sizeof(float); + dst_pitch = dst_pitch / sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + for (int x = 0; x < width; x += 16) + { + // prepare coefs in transposed V-form + __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12); + __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13); + __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14); + __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15); + + _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3); + + // convert resampling program in H-form into permuting indexes for src transposition in V-form + int iStart = program->pixel_offset[x + 0]; + __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \ + program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0); + __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32); + __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32); + __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset + +#if 0 + for (int y = 0; y < height; y++) // single row proc + { + __m512 data_src = _mm512_loadu_ps(src_ptr); + + __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src); + __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src); + __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src); + __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src); + + __m512 result0 = _mm512_mul_ps(data_0, coef_r0); + __m512 result1 = _mm512_mul_ps(data_2, coef_r2); + + result0 = _mm512_fmadd_ps(data_1, coef_r1, result0); + result1 = _mm512_fmadd_ps(data_3, coef_r3, result1); + + _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } +#endif + + const int height_mod2 = (height / 2) * 2; // Process pairs of rows for better efficiency + // dual-rows not worst in performance - may be left for the future better memory performance and compute performance hosts + for (int y = 0; y < height_mod2; y+=2) + { + __m512 data_src = _mm512_loadu_ps(src_ptr); + __m512 data_src_2 = _mm512_loadu_ps(src_ptr + src_pitch); + + __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src); + __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src); + __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src); + __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src); + + __m512 data_0_2 = _mm512_permutexvar_ps(perm_0, data_src_2); + __m512 data_1_2 = _mm512_permutexvar_ps(perm_1, data_src_2); + __m512 data_2_2 = _mm512_permutexvar_ps(perm_2, data_src_2); + __m512 data_3_2 = _mm512_permutexvar_ps(perm_3, data_src_2); + + __m512 result0 = _mm512_mul_ps(data_0, coef_r0); + __m512 result1 = _mm512_mul_ps(data_0_2, coef_r0); + + result0 = _mm512_fmadd_ps(data_1, coef_r1, result0); + result1 = _mm512_fmadd_ps(data_1_2, coef_r1, result1); + + result0 = _mm512_fmadd_ps(data_2, coef_r2, result0); + result1 = _mm512_fmadd_ps(data_2_2, coef_r2, result1); + + result0 = _mm512_fmadd_ps(data_3, coef_r3, result0); + result1 = _mm512_fmadd_ps(data_3_2, coef_r3, result1); + + _mm512_store_ps(dst_ptr, result0); + _mm512_store_ps(dst_ptr + dst_pitch, result1); + + dst_ptr += dst_pitch * 2; + src_ptr += src_pitch * 2; + } + + if (height > height_mod2) // last row + { + __m512 data_src = _mm512_loadu_ps(src_ptr); + + __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src); + __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src); + __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src); + __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src); + + __m512 result0 = _mm512_mul_ps(data_0, coef_r0); + __m512 result1 = _mm512_mul_ps(data_2, coef_r2); + + result0 = _mm512_fmadd_ps(data_1, coef_r1, result0); + result1 = _mm512_fmadd_ps(data_3, coef_r3, result1); + + _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); + } + + current_coeff += filter_size * 16; + } +} +#endif +void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) +{ + // assert - check if max pixel_offset is not above single load of 16 src floats (or need several loads and more complex permute program) + +#ifdef _DEBUG + for (int x = 0; x < width; x += 16) + { + int start_off = program->pixel_offset[x + 0]; + int end_off = program->pixel_offset[x + 15]; + assert((end_off - start_off) > 15); + } +#endif + + int filter_size = program->filter_size; + + const float* AVS_RESTRICT current_coeff; + __m512i one_epi32 = _mm512_set1_epi32(1); + + src_pitch = src_pitch / sizeof(float); + dst_pitch = dst_pitch / sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + for (int x = 0; x < width; x += 16) + { + // prepare coefs in transposed V-form, use gathering - not very slow until TRANSPOSE8_ is designed + + __m512i offsets = _mm512_set_epi32(filter_size * 7, filter_size * 6, filter_size * 5, filter_size * 4, filter_size * 3, filter_size * 2, filter_size * 1, filter_size * 0, \ + filter_size * 7, filter_size * 6, filter_size * 5, filter_size * 4, filter_size * 3, filter_size * 2, filter_size * 1, filter_size * 0 ); + + __m512 coef_r0 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r1 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r2 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r3 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r4 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r5 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r6 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r7 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + + // convert resampling program in H-form into permuting indexes for src transposition in V-form + int iStart = program->pixel_offset[x + 0]; + __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \ + program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0); + __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32); + __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32); + __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32); + __m512i perm_4 = _mm512_add_epi32(perm_3, one_epi32); + __m512i perm_5 = _mm512_add_epi32(perm_4, one_epi32); + __m512i perm_6 = _mm512_add_epi32(perm_5, one_epi32); + __m512i perm_7 = _mm512_add_epi32(perm_6, one_epi32); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset +#if 0 + for (int y = 0; y < height; y++) // single row proc + { + __m512 data_src = _mm512_loadu_ps(src_ptr); + + __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src); + __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src); + __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src); + __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src); + __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src); + __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src); + __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src); + __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src); + + __m512 result0 = _mm512_mul_ps(data_0, coef_r0); + __m512 result1 = _mm512_mul_ps(data_4, coef_r4); + + result0 = _mm512_fmadd_ps(data_1, coef_r1, result0); + result1 = _mm512_fmadd_ps(data_5, coef_r5, result1); + + result0 = _mm512_fmadd_ps(data_2, coef_r2, result0); + result1 = _mm512_fmadd_ps(data_6, coef_r6, result1); + + result0 = _mm512_fmadd_ps(data_3, coef_r3, result0); + result1 = _mm512_fmadd_ps(data_7, coef_r7, result1); + + _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } +#endif + + const int height_mod2 = (height / 2) * 2; // Process pairs of rows for better efficiency + // dual-rows not worst in performance - may be left for the future better memory performance and compute performance hosts + for (int y = 0; y < height_mod2; y += 2) + { + __m512 data_src = _mm512_loadu_ps(src_ptr); + __m512 data_src_2 = _mm512_loadu_ps(src_ptr + src_pitch); + + __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src); + __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src); + __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src); + __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src); + __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src); + __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src); + __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src); + __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src); + + __m512 data_0_2 = _mm512_permutexvar_ps(perm_0, data_src_2); + __m512 data_1_2 = _mm512_permutexvar_ps(perm_1, data_src_2); + __m512 data_2_2 = _mm512_permutexvar_ps(perm_2, data_src_2); + __m512 data_3_2 = _mm512_permutexvar_ps(perm_3, data_src_2); + __m512 data_4_2 = _mm512_permutexvar_ps(perm_4, data_src_2); + __m512 data_5_2 = _mm512_permutexvar_ps(perm_5, data_src_2); + __m512 data_6_2 = _mm512_permutexvar_ps(perm_6, data_src_2); + __m512 data_7_2 = _mm512_permutexvar_ps(perm_7, data_src_2); + + __m512 result0 = _mm512_mul_ps(data_0, coef_r0); + __m512 result1 = _mm512_mul_ps(data_0_2, coef_r0); + + result0 = _mm512_fmadd_ps(data_1, coef_r1, result0); + result1 = _mm512_fmadd_ps(data_1_2, coef_r1, result1); + + result0 = _mm512_fmadd_ps(data_2, coef_r2, result0); + result1 = _mm512_fmadd_ps(data_2_2, coef_r2, result1); + + result0 = _mm512_fmadd_ps(data_3, coef_r3, result0); + result1 = _mm512_fmadd_ps(data_3_2, coef_r3, result1); + + result0 = _mm512_fmadd_ps(data_4, coef_r4, result0); + result1 = _mm512_fmadd_ps(data_4_2, coef_r4, result1); + + result0 = _mm512_fmadd_ps(data_5, coef_r5, result0); + result1 = _mm512_fmadd_ps(data_5_2, coef_r5, result1); + + result0 = _mm512_fmadd_ps(data_6, coef_r6, result0); + result1 = _mm512_fmadd_ps(data_6_2, coef_r6, result1); + + result0 = _mm512_fmadd_ps(data_7, coef_r7, result0); + result1 = _mm512_fmadd_ps(data_7_2, coef_r7, result1); + + _mm512_store_ps(dst_ptr, result0); + _mm512_store_ps(dst_ptr + dst_pitch, result1); + + dst_ptr += dst_pitch * 2; + src_ptr += src_pitch * 2; + } + + if (height > height_mod2) // last row + { + __m512 data_src = _mm512_loadu_ps(src_ptr); + + __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src); + __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src); + __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src); + __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src); + __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src); + __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src); + __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src); + __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src); + + __m512 result0 = _mm512_mul_ps(data_0, coef_r0); + __m512 result1 = _mm512_mul_ps(data_4, coef_r4); + + result0 = _mm512_fmadd_ps(data_1, coef_r1, result0); + result1 = _mm512_fmadd_ps(data_5, coef_r5, result1); + + result0 = _mm512_fmadd_ps(data_2, coef_r2, result0); + result1 = _mm512_fmadd_ps(data_6, coef_r6, result1); + + result0 = _mm512_fmadd_ps(data_3, coef_r3, result0); + result1 = _mm512_fmadd_ps(data_7, coef_r7, result1); + + _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); + } + + current_coeff += filter_size * 16; + } +} + +#if 0 +void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) +{ + // assert - check if max pixel_offset is not above single load of 16 src floats (or need several loads and more complex permute program) +#ifdef _DEBUG + for (int x = 0; x < width; x += 16) + { + int start_off = program->pixel_offset[x + 0]; + int end_off = program->pixel_offset[x + 15]; + assert((end_off - start_off) > 15); + } +#endif + + int filter_size = program->filter_size; + + const float* AVS_RESTRICT current_coeff; + __m512i one_epi32 = _mm512_set1_epi32(1); + + src_pitch = src_pitch / sizeof(float); + dst_pitch = dst_pitch / sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + for (int x = 0; x < width; x += 16) + { + // prepare coefs in transposed V-form, use gathering - not very slow until TRANSPOSE8_ is designed + + __m512i offsets = _mm512_set_epi32(filter_size * 15, filter_size * 14, filter_size * 13, filter_size * 12, filter_size * 11, filter_size * 10, filter_size * 9, filter_size * 8, \ + filter_size * 7, filter_size * 6, filter_size * 5, filter_size * 4, filter_size * 3, filter_size * 2, filter_size * 1, filter_size * 0); + + __m512 coef_r0 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r1 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r2 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r3 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r4 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r5 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r6 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r7 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r8 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r9 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r10 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r11 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r12 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r13 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r14 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + offsets = _mm512_add_epi32(offsets, one_epi32); + __m512 coef_r15 = _mm512_i32gather_ps(offsets, current_coeff, 4); + + + // convert resampling program in H-form into permuting indexes for src transposition in V-form + int iStart = program->pixel_offset[x + 0]; + __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \ + program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0); + __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32); + __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32); + __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32); + __m512i perm_4 = _mm512_add_epi32(perm_3, one_epi32); + __m512i perm_5 = _mm512_add_epi32(perm_4, one_epi32); + __m512i perm_6 = _mm512_add_epi32(perm_5, one_epi32); + __m512i perm_7 = _mm512_add_epi32(perm_6, one_epi32); + __m512i perm_8 = _mm512_add_epi32(perm_7, one_epi32); + __m512i perm_9 = _mm512_add_epi32(perm_8, one_epi32); + __m512i perm_10 = _mm512_add_epi32(perm_9, one_epi32); + __m512i perm_11 = _mm512_add_epi32(perm_10, one_epi32); + __m512i perm_12 = _mm512_add_epi32(perm_11, one_epi32); + __m512i perm_13 = _mm512_add_epi32(perm_12, one_epi32); + __m512i perm_14 = _mm512_add_epi32(perm_13, one_epi32); + __m512i perm_15 = _mm512_add_epi32(perm_14, one_epi32); // to do : test if better to add one_epi32 in the loop and only store perm_0 complex to fill dataword + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset + + for (int y = 0; y < height; y++) // single row proc + { + __m512 data_src = _mm512_loadu_ps(src_ptr); + + __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src); + __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src); + __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src); + __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src); + __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src); + __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src); + __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src); + __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src); + __m512 data_8 = _mm512_permutexvar_ps(perm_8, data_src); + __m512 data_9 = _mm512_permutexvar_ps(perm_9, data_src); + __m512 data_10 = _mm512_permutexvar_ps(perm_10, data_src); + __m512 data_11 = _mm512_permutexvar_ps(perm_11, data_src); + __m512 data_12 = _mm512_permutexvar_ps(perm_12, data_src); + __m512 data_13 = _mm512_permutexvar_ps(perm_13, data_src); + __m512 data_14 = _mm512_permutexvar_ps(perm_14, data_src); + __m512 data_15 = _mm512_permutexvar_ps(perm_15, data_src); + + __m512 result0 = _mm512_mul_ps(data_0, coef_r0); + __m512 result1 = _mm512_mul_ps(data_8, coef_r8); + + result0 = _mm512_fmadd_ps(data_1, coef_r1, result0); + result1 = _mm512_fmadd_ps(data_9, coef_r9, result1); + + result0 = _mm512_fmadd_ps(data_2, coef_r2, result0); + result1 = _mm512_fmadd_ps(data_10, coef_r10, result1); + + result0 = _mm512_fmadd_ps(data_3, coef_r3, result0); + result1 = _mm512_fmadd_ps(data_11, coef_r11, result1); + + result0 = _mm512_fmadd_ps(data_4, coef_r4, result0); + result1 = _mm512_fmadd_ps(data_12, coef_r12, result1); + + result0 = _mm512_fmadd_ps(data_5, coef_r5, result0); + result1 = _mm512_fmadd_ps(data_13, coef_r13, result1); + + result0 = _mm512_fmadd_ps(data_6, coef_r6, result0); + result1 = _mm512_fmadd_ps(data_14, coef_r14, result1); + + result0 = _mm512_fmadd_ps(data_7, coef_r7, result0); + result1 = _mm512_fmadd_ps(data_15, coef_r15, result1); + + _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } + + current_coeff += filter_size * 16; + } +} +#endif + + + +//-------- 512 bit float Verticals + +void resize_v_avx512_planar_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel) +{ + AVS_UNUSED(bits_per_pixel); + + const int filter_size = program->filter_size; + const float* AVS_RESTRICT current_coeff = program->pixel_coefficient_float; + + const float* src = (const float*)src8; + float* AVS_RESTRICT dst = (float*)dst8; + dst_pitch = dst_pitch / sizeof(float); + src_pitch = src_pitch / sizeof(float); + + const int kernel_size = program->filter_size_real; // not the aligned + const int kernel_size_mod2 = (kernel_size / 2) * 2; // Process pairs of rows for better efficiency + const bool notMod2 = kernel_size_mod2 < kernel_size; + + for (int y = 0; y < target_height; y++) { + int offset = program->pixel_offset[y]; + const float* src_ptr = src + offset * src_pitch; + + // 64 byte 16 floats (AVX512 register holds 16 floats) + // no need for wmod8, alignment is safe 32 bytes at least - is it safe for 64 bytes ? + for (int x = 0; x < width; x += 16) { + __m512 result_single = _mm512_setzero_ps(); + __m512 result_single_2 = _mm512_setzero_ps(); + + const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here + + // Process pairs of rows for better efficiency (2 coeffs/cycle) + // two result variables for potential parallel operation + int i = 0; + for (; i < kernel_size_mod2; i += 2) { + __m512 coeff_even = _mm512_set1_ps(current_coeff[i]); + __m512 coeff_odd = _mm512_set1_ps(current_coeff[i + 1]); + + __m512 src_even = _mm512_loadu_ps(src2_ptr); + __m512 src_odd = _mm512_loadu_ps(src2_ptr + src_pitch); + + result_single = _mm512_fmadd_ps(src_even, coeff_even, result_single); + result_single_2 = _mm512_fmadd_ps(src_odd, coeff_odd, result_single_2); + + src2_ptr += 2 * src_pitch; + } + + result_single = _mm512_add_ps(result_single, result_single_2); + + // Process the last odd row if needed + if (notMod2) { + __m512 coeff = _mm512_set1_ps(current_coeff[i]); + __m512 src_val = _mm512_loadu_ps(src2_ptr); + result_single = _mm512_fmadd_ps(src_val, coeff, result_single); + } + + _mm512_store_ps(dst + x, result_single); + } + + dst += dst_pitch; + current_coeff += filter_size; + } +} + +void resize_v_avx512_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel) +{ + AVS_UNUSED(bits_per_pixel); + + const int filter_size = program->filter_size; + const float* AVS_RESTRICT current_coeff = program->pixel_coefficient_float; + + const float* src = (const float*)src8; + float* AVS_RESTRICT dst = (float*)dst8; + dst_pitch = dst_pitch / sizeof(float); + src_pitch = src_pitch / sizeof(float); + + const int kernel_size = program->filter_size_real; // not the aligned + const int kernel_size_mod2 = (kernel_size / 2) * 2; // Process pairs of rows for better efficiency + const bool notMod2 = kernel_size_mod2 < kernel_size; + + const int width_mod128 = (width / 128) * 128; // Process by 8x 512it (8 x 16 floats) to make memory read/write linear streams longer, 32x512 bit registers should be enough + const int width_mod64 = (width / 64) * 64; // Process by 4x 512it (4 x 16 floats) to make memory read/write linear streams longer, + const int width_mod32 = (width / 32) * 32; // Process by 2x 512it (2 x 16 floats) to make memory read/write linear streams longer, + + for (int y = 0; y < target_height; y++) { + int offset = program->pixel_offset[y]; + const float* src_ptr = src + offset * src_pitch; + + for (int x = 0; x < width_mod128; x += 128) { + __m512 result_1 = _mm512_setzero_ps(); + __m512 result_2 = _mm512_setzero_ps(); + __m512 result_3 = _mm512_setzero_ps(); + __m512 result_4 = _mm512_setzero_ps(); + __m512 result_5 = _mm512_setzero_ps(); + __m512 result_6 = _mm512_setzero_ps(); + __m512 result_7 = _mm512_setzero_ps(); + __m512 result_8 = _mm512_setzero_ps(); + + const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here + + int i = 0; + for (; i < kernel_size; i ++) { + __m512 coeff = _mm512_set1_ps(current_coeff[i]); + + __m512 src_1 = _mm512_load_ps(src2_ptr); + __m512 src_2 = _mm512_load_ps(src2_ptr + 16); + __m512 src_3 = _mm512_load_ps(src2_ptr + 32); + __m512 src_4 = _mm512_load_ps(src2_ptr + 48); + __m512 src_5 = _mm512_load_ps(src2_ptr + 64); + __m512 src_6 = _mm512_load_ps(src2_ptr + 80); + __m512 src_7 = _mm512_load_ps(src2_ptr + 96); + __m512 src_8 = _mm512_load_ps(src2_ptr + 112); + + result_1 = _mm512_fmadd_ps(src_1, coeff, result_1); + result_2 = _mm512_fmadd_ps(src_2, coeff, result_2); + result_3 = _mm512_fmadd_ps(src_3, coeff, result_3); + result_4 = _mm512_fmadd_ps(src_4, coeff, result_4); + result_5 = _mm512_fmadd_ps(src_5, coeff, result_5); + result_6 = _mm512_fmadd_ps(src_6, coeff, result_6); + result_7 = _mm512_fmadd_ps(src_7, coeff, result_7); + result_8 = _mm512_fmadd_ps(src_8, coeff, result_8); + + src2_ptr += src_pitch; + } + + _mm512_store_ps(dst + x, result_1); + _mm512_store_ps(dst + x + 16, result_2); + _mm512_store_ps(dst + x + 32, result_3); + _mm512_store_ps(dst + x + 48, result_4); + _mm512_store_ps(dst + x + 64, result_5); + _mm512_store_ps(dst + x + 80, result_6); + _mm512_store_ps(dst + x + 96, result_7); + _mm512_store_ps(dst + x + 112, result_8); + } + + for (int x = width_mod128; x < width_mod64; x += 64) { + __m512 result_1 = _mm512_setzero_ps(); + __m512 result_2 = _mm512_setzero_ps(); + __m512 result_3 = _mm512_setzero_ps(); + __m512 result_4 = _mm512_setzero_ps(); + + const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here + + int i = 0; + for (; i < kernel_size; i++) { + __m512 coeff = _mm512_set1_ps(current_coeff[i]); + + __m512 src_1 = _mm512_load_ps(src2_ptr); + __m512 src_2 = _mm512_load_ps(src2_ptr + 16); + __m512 src_3 = _mm512_load_ps(src2_ptr + 32); + __m512 src_4 = _mm512_load_ps(src2_ptr + 48); + + result_1 = _mm512_fmadd_ps(src_1, coeff, result_1); + result_2 = _mm512_fmadd_ps(src_2, coeff, result_2); + result_3 = _mm512_fmadd_ps(src_3, coeff, result_3); + result_4 = _mm512_fmadd_ps(src_4, coeff, result_4); + + src2_ptr += src_pitch; + } + + _mm512_store_ps(dst + x, result_1); + _mm512_store_ps(dst + x + 16, result_2); + _mm512_store_ps(dst + x + 32, result_3); + _mm512_store_ps(dst + x + 48, result_4); + } + + for (int x = width_mod64; x < width_mod32; x += 32) { + __m512 result_1 = _mm512_setzero_ps(); + __m512 result_2 = _mm512_setzero_ps(); + + const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here + + int i = 0; + for (; i < kernel_size; i++) { + __m512 coeff = _mm512_set1_ps(current_coeff[i]); + + __m512 src_1 = _mm512_load_ps(src2_ptr); + __m512 src_2 = _mm512_load_ps(src2_ptr + 16); + + result_1 = _mm512_fmadd_ps(src_1, coeff, result_1); + result_2 = _mm512_fmadd_ps(src_2, coeff, result_2); + + src2_ptr += src_pitch; + } + + _mm512_store_ps(dst + x, result_1); + _mm512_store_ps(dst + x + 16, result_2); + } + + + // 64 byte 16 floats (AVX512 register holds 16 floats) + // row alignment is 64 bytes - so it is safe to load mod16 of float32 ? + for (int x = width_mod32; x < width; x += 16) { + __m512 result_single = _mm512_setzero_ps(); + __m512 result_single_2 = _mm512_setzero_ps(); + + const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here + + // Process pairs of rows for better efficiency (2 coeffs/cycle) + // two result variables for potential parallel operation + int i = 0; + for (; i < kernel_size_mod2; i += 2) { + __m512 coeff_even = _mm512_set1_ps(current_coeff[i]); + __m512 coeff_odd = _mm512_set1_ps(current_coeff[i + 1]); + + __m512 src_even = _mm512_load_ps(src2_ptr); + __m512 src_odd = _mm512_load_ps(src2_ptr + src_pitch); + + result_single = _mm512_fmadd_ps(src_even, coeff_even, result_single); + result_single_2 = _mm512_fmadd_ps(src_odd, coeff_odd, result_single_2); + + src2_ptr += 2 * src_pitch; + } + + result_single = _mm512_add_ps(result_single, result_single_2); + + // Process the last odd row if needed + if (notMod2) { + __m512 coeff = _mm512_set1_ps(current_coeff[i]); + __m512 src_val = _mm512_loadu_ps(src2_ptr); + result_single = _mm512_fmadd_ps(src_val, coeff, result_single); + } + + _mm512_store_ps(dst + x, result_single); + } + + + dst += dst_pitch; + current_coeff += filter_size; + } +} + +// uint8_t +void resize_v_avx512_planar_uint8_t_w_sr(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel) +{ + AVS_UNUSED(bits_per_pixel); + int filter_size = program->filter_size; + const short* AVS_RESTRICT current_coeff = program->pixel_coefficient; + __m512i rounder = _mm512_set1_epi32(1 << (FPScale8bits - 1)); + __m512i zero = _mm512_setzero_si512(); + + const int kernel_size = program->filter_size_real; // not the aligned + + const int width_mod128 = (width / 128) * 128; + + const __m512i perm_idx1 = _mm512_set_epi64(8 + 5, 8 + 4, 8 + 1, 8 + 0, 5, 4, 1, 0); + const __m512i perm_idx2 = _mm512_set_epi64(8 + 7, 8 + 6, 8 + 3, 8 + 2, 7, 6, 3, 2); + + for (int y = 0; y < target_height; y++) { + int offset = program->pixel_offset[y]; + const BYTE* AVS_RESTRICT src_ptr = src + offset * src_pitch; + + for (int x = 0; x < width_mod128; x += 128) { + + __m512i result_lo = rounder; + __m512i result_hi = rounder; + __m512i result_lo2 = rounder; + __m512i result_hi2 = rounder; + + __m512i result_lo_2 = rounder; + __m512i result_hi_2 = rounder; + __m512i result_lo2_2 = rounder; + __m512i result_hi2_2 = rounder; + + const uint8_t* AVS_RESTRICT src2_ptr = src_ptr + x; + + int i = 0; + // 128 byte 128 pixel + for (; i < kernel_size; i++) { + // Broadcast a single coefficients + __m512i coeff = _mm512_set1_epi16(*reinterpret_cast(current_coeff + i)); // 0|co|0|co|0|co|0|co 0|co|0|co|0|co|0|co + + __m512i src_1_1 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast(src2_ptr))); // 32x 8->16bit pixels + __m512i src_1_2 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast(src2_ptr + 32))); // 32x 8->16bit pixels + __m512i src_2_1 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast(src2_ptr + 64))); // 32x 8->16bit pixels + __m512i src_2_2 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast(src2_ptr + 96))); // 32x 8->16bit pixels + + __m512i src_lo = _mm512_unpacklo_epi16(src_1_1, zero); + __m512i src_hi = _mm512_unpackhi_epi16(src_1_1, zero); + __m512i src_lo2 = _mm512_unpacklo_epi16(src_1_2, zero); + __m512i src_hi2 = _mm512_unpackhi_epi16(src_1_2, zero); + + __m512i src_lo_2 = _mm512_unpacklo_epi16(src_2_1, zero); + __m512i src_hi_2 = _mm512_unpackhi_epi16(src_2_1, zero); + __m512i src_lo2_2 = _mm512_unpacklo_epi16(src_2_2, zero); + __m512i src_hi2_2 = _mm512_unpackhi_epi16(src_2_2, zero); + + result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c + result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c + result_lo2 = _mm512_add_epi32(result_lo2, _mm512_madd_epi16(src_lo2, coeff)); // a*b + c + result_hi2 = _mm512_add_epi32(result_hi2, _mm512_madd_epi16(src_hi2, coeff)); // a*b + c + + result_lo_2 = _mm512_add_epi32(result_lo_2, _mm512_madd_epi16(src_lo_2, coeff)); // a*b + c + result_hi_2 = _mm512_add_epi32(result_hi_2, _mm512_madd_epi16(src_hi_2, coeff)); // a*b + c + result_lo2_2 = _mm512_add_epi32(result_lo2_2, _mm512_madd_epi16(src_lo2_2, coeff)); // a*b + c + result_hi2_2 = _mm512_add_epi32(result_hi2_2, _mm512_madd_epi16(src_hi2_2, coeff)); // a*b + c + + src2_ptr += src_pitch; + + } + + // scale back, store + // shift back integer arithmetic 14 bits precision + result_lo = _mm512_srai_epi32(result_lo, FPScale8bits); + result_hi = _mm512_srai_epi32(result_hi, FPScale8bits); + result_lo2 = _mm512_srai_epi32(result_lo2, FPScale8bits); + result_hi2 = _mm512_srai_epi32(result_hi2, FPScale8bits); + + result_lo_2 = _mm512_srai_epi32(result_lo_2, FPScale8bits); + result_hi_2 = _mm512_srai_epi32(result_hi_2, FPScale8bits); + result_lo2_2 = _mm512_srai_epi32(result_lo2_2, FPScale8bits); + result_hi2_2 = _mm512_srai_epi32(result_hi2_2, FPScale8bits); + + __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi); + __m512i result2_2x8x_uint16 = _mm512_packus_epi32(result_lo2, result_hi2); + + __m512i result_2x8x_uint16_2 = _mm512_packus_epi32(result_lo_2, result_hi_2); + __m512i result2_2x8x_uint16_2 = _mm512_packus_epi32(result_lo2_2, result_hi2_2); + + __m512i pack_1 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx1, result2_2x8x_uint16); + __m512i pack_2 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx2, result2_2x8x_uint16); + + __m512i pack_1_2 = _mm512_permutex2var_epi64(result_2x8x_uint16_2, perm_idx1, result2_2x8x_uint16_2); + __m512i pack_2_2 = _mm512_permutex2var_epi64(result_2x8x_uint16_2, perm_idx2, result2_2x8x_uint16_2); + + __m512i res = _mm512_packus_epi16(pack_1, pack_2); + __m512i res_2 = _mm512_packus_epi16(pack_1_2, pack_2_2); + + _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), res); + _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x + 64), res_2); + + } + + // 64 byte 64 pixel + // no need wmod16, alignment is safe at least 32 + for (int x = width_mod128; x < width; x += 64) { + + __m512i result_lo = rounder; + __m512i result_hi = rounder; + + __m512i result_lo2 = rounder; + __m512i result_hi2 = rounder; + + const uint8_t* AVS_RESTRICT src2_ptr = src_ptr + x; + + int i = 0; + for (; i < kernel_size; i++) { + // Broadcast a single coefficients + __m512i coeff = _mm512_set1_epi16(*reinterpret_cast(current_coeff + i)); // 0|co|0|co|0|co|0|co 0|co|0|co|0|co|0|co + + __m512i src_1_1 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast(src2_ptr))); // 32x 8->16bit pixels + __m512i src_1_2 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast(src2_ptr + 32))); // 32x 8->16bit pixels + + __m512i src_lo = _mm512_unpacklo_epi16(src_1_1, zero); + __m512i src_hi = _mm512_unpackhi_epi16(src_1_1, zero); + + __m512i src_lo2 = _mm512_unpacklo_epi16(src_1_2, zero); + __m512i src_hi2 = _mm512_unpackhi_epi16(src_1_2, zero); + + result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c + result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c + + result_lo2 = _mm512_add_epi32(result_lo2, _mm512_madd_epi16(src_lo2, coeff)); // a*b + c + result_hi2 = _mm512_add_epi32(result_hi2, _mm512_madd_epi16(src_hi2, coeff)); // a*b + c + + src2_ptr += src_pitch; + + } + + // scale back, store + // shift back integer arithmetic 14 bits precision + result_lo = _mm512_srai_epi32(result_lo, FPScale8bits); + result_hi = _mm512_srai_epi32(result_hi, FPScale8bits); + + result_lo2 = _mm512_srai_epi32(result_lo2, FPScale8bits); + result_hi2 = _mm512_srai_epi32(result_hi2, FPScale8bits); + + __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi); + __m512i result_2x8x_uint16_2 = _mm512_packus_epi32(result_lo2, result_hi2); + + __m512i pack_1 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx1, result_2x8x_uint16_2); + __m512i pack_2 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx2, result_2x8x_uint16_2); + + __m512i res = _mm512_packus_epi16(pack_1, pack_2); + + _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), res); + + } + + dst += dst_pitch; + current_coeff += filter_size; + } +} + +//uint16_t +template +void resize_v_avx512_planar_uint16_t_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel) +{ + int filter_size = program->filter_size; + const short* AVS_RESTRICT current_coeff = program->pixel_coefficient; + + const __m512i zero = _mm512_setzero_si512(); + + const int width_mod64 = (width / 64) * 64; + + // for 16 bits only + const __m512i shifttosigned = _mm512_set1_epi16(-32768); + const __m512i shiftfromsigned = _mm512_set1_epi32(32768 << FPScale16bits); + + const __m512i rounder = _mm512_set1_epi32(1 << (FPScale16bits - 1)); + + const uint16_t* src = (uint16_t*)src8; + uint16_t* AVS_RESTRICT dst = (uint16_t * AVS_RESTRICT)dst8; + dst_pitch = dst_pitch / sizeof(uint16_t); + src_pitch = src_pitch / sizeof(uint16_t); + + const int kernel_size = program->filter_size_real; // not the aligned + + const int limit = (1 << bits_per_pixel) - 1; + __m512i clamp_limit = _mm512_set1_epi16((short)limit); // clamp limit for <16 bits + + for (int y = 0; y < target_height; y++) { + int offset = program->pixel_offset[y]; + const uint16_t* src_ptr = src + offset * src_pitch; + + // 128 byte 32 word + for (int x = 0; x < width_mod64; x += 64) { + + __m512i result_lo = rounder; + __m512i result_hi = rounder; + + __m512i result_lo_2 = rounder; + __m512i result_hi_2 = rounder; + + const uint16_t* AVS_RESTRICT src2_ptr = src_ptr + x; + + int i = 0; + for (; i < kernel_size; i++) { + // Broadcast a single coefficients + __m512i coeff = _mm512_set1_epi16(current_coeff[i]); // 0|co|0|co|0|co|0|co 0|co|0|co|0|co|0|co + + __m512i src = _mm512_load_si512(reinterpret_cast(src2_ptr)); // 32x 16bit pixels + __m512i src_2 = _mm512_load_si512(reinterpret_cast(src2_ptr + 32)); // 32x 16bit pixels + + if (!lessthan16bit) { + src = _mm512_add_epi16(src, shifttosigned); + src_2 = _mm512_add_epi16(src_2, shifttosigned); + } + + __m512i src_lo = _mm512_unpacklo_epi16(src, zero); + __m512i src_hi = _mm512_unpackhi_epi16(src, zero); + + __m512i src_lo_2 = _mm512_unpacklo_epi16(src_2, zero); + __m512i src_hi_2 = _mm512_unpackhi_epi16(src_2, zero); + + result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c + result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c + + result_lo_2 = _mm512_add_epi32(result_lo_2, _mm512_madd_epi16(src_lo_2, coeff)); // a*b + c + result_hi_2 = _mm512_add_epi32(result_hi_2, _mm512_madd_epi16(src_hi_2, coeff)); // a*b + c + + src2_ptr += src_pitch; + } + + if (!lessthan16bit) { + result_lo = _mm512_add_epi32(result_lo, shiftfromsigned); + result_hi = _mm512_add_epi32(result_hi, shiftfromsigned); + + result_lo_2 = _mm512_add_epi32(result_lo_2, shiftfromsigned); + result_hi_2 = _mm512_add_epi32(result_hi_2, shiftfromsigned); + + } + // shift back integer arithmetic 13 bits precision + result_lo = _mm512_srai_epi32(result_lo, FPScale16bits); + result_hi = _mm512_srai_epi32(result_hi, FPScale16bits); + + result_lo_2 = _mm512_srai_epi32(result_lo_2, FPScale16bits); + result_hi_2 = _mm512_srai_epi32(result_hi_2, FPScale16bits); + + __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi); + __m512i result_2x8x_uint16_2 = _mm512_packus_epi32(result_lo_2, result_hi_2); + if (lessthan16bit) { + result_2x8x_uint16 = _mm512_min_epu16(result_2x8x_uint16, clamp_limit); // extra clamp for 10-14 bit + result_2x8x_uint16_2 = _mm512_min_epu16(result_2x8x_uint16_2, clamp_limit); // extra clamp for 10-14 bit + } + _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), result_2x8x_uint16); + _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x + 32), result_2x8x_uint16_2); + } + + // last 32 + // 64 byte 32 word + for (int x = width_mod64; x < width; x += 32) { + + __m512i result_lo = rounder; + __m512i result_hi = rounder; + + const uint16_t* AVS_RESTRICT src2_ptr = src_ptr + x; + + int i = 0; + for (; i < kernel_size; i++) { + // Broadcast a single coefficients + __m512i coeff = _mm512_set1_epi16(current_coeff[i]); // 0|co|0|co|0|co|0|co 0|co|0|co|0|co|0|co + + __m512i src = _mm512_load_si512(reinterpret_cast(src2_ptr)); // 32x 16bit pixels + if (!lessthan16bit) { + src = _mm512_add_epi16(src, shifttosigned); + } + __m512i src_lo = _mm512_unpacklo_epi16(src, zero); + __m512i src_hi = _mm512_unpackhi_epi16(src, zero); + result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c + result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c + + src2_ptr += src_pitch; + } + + if (!lessthan16bit) { + result_lo = _mm512_add_epi32(result_lo, shiftfromsigned); + result_hi = _mm512_add_epi32(result_hi, shiftfromsigned); + } + // shift back integer arithmetic 13 bits precision + result_lo = _mm512_srai_epi32(result_lo, FPScale16bits); + result_hi = _mm512_srai_epi32(result_hi, FPScale16bits); + + __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi); + if (lessthan16bit) { + result_2x8x_uint16 = _mm512_min_epu16(result_2x8x_uint16, clamp_limit); // extra clamp for 10-14 bit + } + _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), result_2x8x_uint16); + + } + + dst += dst_pitch; + current_coeff += filter_size; + } +} + +// avx512 16 +template void resize_v_avx512_planar_uint16_t_w_sr(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); +// avx512 10-14bit +template void resize_v_avx512_planar_uint16_t_w_sr(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); + + + + diff --git a/avs_core/filters/intel/resample_avx512.h b/avs_core/filters/intel/resample_avx512.h new file mode 100644 index 000000000..a9b5a0706 --- /dev/null +++ b/avs_core/filters/intel/resample_avx512.h @@ -0,0 +1,147 @@ +// Avisynth v2.5. Copyright 2002 Ben Rudiak-Gould et al. +// http://avisynth.nl + +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit +// http://www.gnu.org/copyleft/gpl.html . +// +// Linking Avisynth statically or dynamically with other modules is making a +// combined work based on Avisynth. Thus, the terms and conditions of the GNU +// General Public License cover the whole combination. +// +// As a special exception, the copyright holders of Avisynth give you +// permission to link Avisynth with independent modules that communicate with +// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license +// terms of these independent modules, and to copy and distribute the +// resulting combined work under terms of your choice, provided that +// every copy of the combined work is accompanied by a complete copy of +// the source code of Avisynth (the version of Avisynth used to produce the +// combined work), being distributed under the terms of the GNU General +// Public License plus this exception. An independent module is a module +// which is not derived from or based on Avisynth, such as 3rd-party filters, +// import and export plugins, or graphical user interfaces. + +#ifndef __Resample_AVX512_H__ +#define __Resample_AVX512_H__ + +#include +#include "../resample_functions.h" + +#include // includes AVX, AVX2, FMA3, AVX512F, AVX512BW, etc. for MSVC, Clang, and GCC + +// compiler feature checks and error handling +#if defined(__clang__) && !defined(_MSC_VER) +#if !defined(__AVX512F__) || !defined(__AVX512BW__) +#error "This code requires a compiler that supports AVX-512F and AVX-512BW. Use compiler flags -mavx512f -mavx512bw." +#endif +#elif defined(__GNUC__) +#if !defined(__AVX512F__) || !defined(__AVX512BW__) +#error "This code requires a compiler that supports AVX-512F and AVX-512BW. Use compiler flags -mavx512f -mavx512bw." +#endif +#elif defined(_MSC_VER) + #if !defined(_M_X64) && !defined(_M_AMD64) && !defined(_M_ARM64) + #error "AVX-512 is only supported on x64 and ARM64 architectures." + #endif + // MSVC's provides AVX-512 support when /arch:AVX512 is used. + // However, MSVC may not define __AVX512F__ or __AVX512BW__ consistently. + // We rely on /arch:AVX512 having been set, and assume that if the user is + // including this header, they intend to use AVX-512. +#else + #error "Unsupported compiler. This code requires a compiler that supports AVX-512F and AVX-512BW (GCC, Clang, or MSVC)." +#endif + +#if !defined(__FMA__) +// Assume that all processors that have AVX2/AVX512 also have FMA3 +#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (__clang__) +// Prevent error message in g++ when using FMA intrinsics with avx2: +#pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher" +#else +#define __FMA__ 1 +#endif +#endif +// FMA3 instruction set +#if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__)) && ! defined (__INTEL_COMPILER) +#include +#endif // __FMA__ + +// MSVC Missing Intrinsics (Workaround for older MSVC versions) +#if defined(_MSC_VER) && !defined(__clang__) +#if _MSC_VER < 1922 // Check for MSVC version less than 16.2 (VS 2019 16.2) + // Define missing AVX-512BW mask intrinsics for older MSVC. + // inline functions that perform the mask operations directly. + // Since this is MSVC only, using specific __forceinline. +__forceinline __mmask64 _kand_mask64(__mmask64 a, __mmask64 b) { return a & b; } +__forceinline __mmask64 _kor_mask64(__mmask64 a, __mmask64 b) { return a | b; } +__forceinline __mmask32 _kand_mask32(__mmask32 a, __mmask32 b) { return a & b; } +__forceinline __mmask32 _kor_mask32(__mmask32 a, __mmask32 b) { return a | b; } +#endif +#endif + +// useful macros + +#define _MM_TRANSPOSE16_LANE4_PS(row0, row1, row2, row3) \ + do { \ + __m512 __t0, __t1, __t2, __t3; \ + __t0 = _mm512_unpacklo_ps(row0, row1); \ + __t1 = _mm512_unpackhi_ps(row0, row1); \ + __t2 = _mm512_unpacklo_ps(row2, row3); \ + __t3 = _mm512_unpackhi_ps(row2, row3); \ + row0 = _mm512_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \ + row1 = _mm512_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \ + row2 = _mm512_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \ + row3 = _mm512_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \ + } while (0) + +#ifndef _mm512_loadu_4_m128 +#define _mm512_loadu_4_m128(/* __m128 const* */ addr1, \ + /* __m128 const* */ addr2, \ + /* __m128 const* */ addr3, \ + /* __m128 const* */ addr4) \ +_mm512_insertf32x4(_mm512_insertf32x4(_mm512_insertf32x4(_mm512_castps128_ps512(_mm_loadu_ps(addr1)), _mm_loadu_ps(addr2), 1), _mm_loadu_ps(addr3), 2), _mm_loadu_ps(addr4), 3) +#endif + +#ifndef _mm512_load_4_m128 +#define _mm512_load_4_m128(/* __m128 const* */ addr1, \ + /* __m128 const* */ addr2, \ + /* __m128 const* */ addr3, \ + /* __m128 const* */ addr4) \ +_mm512_insertf32x4(_mm512_insertf32x4(_mm512_insertf32x4(_mm512_castps128_ps512(_mm_load_ps(addr1)), _mm_load_ps(addr2), 1), _mm_load_ps(addr3), 2), _mm_load_ps(addr4), 3) +#endif + + +template +void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + +template +void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + +template +void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + + +void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + +void resize_v_avx512_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); +void resize_v_avx512_planar_float_w_sr(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); + +// uint8_t +void resize_v_avx512_planar_uint8_t_w_sr(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); + +// uint16_t +template +void resize_v_avx512_planar_uint16_t_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); + +#endif // __Resample_AVX512_H__ diff --git a/avs_core/filters/intel/resample_sse.cpp b/avs_core/filters/intel/resample_sse.cpp index 0fc6c66ca..7f0afbca3 100644 --- a/avs_core/filters/intel/resample_sse.cpp +++ b/avs_core/filters/intel/resample_sse.cpp @@ -152,7 +152,7 @@ void resize_v_mmx_planar(BYTE* dst, const BYTE* src, int dst_pitch, int src_pitc } #endif -/* +#if 0 void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel) { AVS_UNUSED(bits_per_pixel); @@ -226,7 +226,7 @@ void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pi current_coeff += filter_size; } } -*/ +#else void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel) { @@ -340,7 +340,7 @@ void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pi current_coeff += filter_size; } } - +#endif // like the AVX2 version, but only 8 pixels at a time template void resize_v_sse2_planar_uint16_t(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel) @@ -597,7 +597,7 @@ void resizer_h_ssse3_generic_float(BYTE* dst8, const BYTE* src8, int dst_pitch, dst_pitch = dst_pitch / sizeof(float); src_pitch = src_pitch / sizeof(float); - const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8; + const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8; for (int y = 0; y < height; y++) { float* current_coeff_base = program->pixel_coefficient_float; @@ -987,7 +987,7 @@ void resizer_h_ssse3_generic_uint8_16(BYTE* dst8, const BYTE* src8, int dst_pitc dst_pitch /= sizeof(pixel_t); src_pitch /= sizeof(pixel_t); - const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8; + const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8; for (int y = 0; y < height; y++) { const short* AVS_RESTRICT current_coeff_base = program->pixel_coefficient; @@ -1018,3 +1018,289 @@ template void resizer_h_ssse3_generic_uint8_16(BYTE* dst8, const template void resize_v_sse2_planar_uint16_t(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); template void resize_v_sse2_planar_uint16_t(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); +// Transpose-based SIMD +void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) { + int filter_size = program->filter_size; + + const float* AVS_RESTRICT current_coeff; + + src_pitch = src_pitch / sizeof(float); + dst_pitch = dst_pitch / sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + const int kernel_size = program->filter_size_real; + const int ksmod4 = kernel_size / 4 * 4; + // const int ksmod8 = kernel_size / 8 * 8; + +#if 0 + // single row processing - slower + for (int y = 0; y < height; y++) { + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch; + const float* src_ptr = src + y * src_pitch; + + // FIXME: the SIMD safe end is not width, but safe_width + for (int x = 0; x < width; x += 4) { + + __m128 result = _mm_setzero_ps(); + + for (int i = 0; i < ksmod4; i += 4) { + // 4 pixels, in outer x loop. Each has different "begin" offset + __m128 data_1 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 0] + i); + __m128 data_2 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 1] + i); + __m128 data_3 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 2] + i); + __m128 data_4 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 3] + i); + + __m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0); + __m128 coeff_2 = _mm_load_ps(current_coeff + i + filter_size * 1); + __m128 coeff_3 = _mm_load_ps(current_coeff + i + filter_size * 2); + __m128 coeff_4 = _mm_load_ps(current_coeff + i + filter_size * 3); + + _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4); + _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4); + + result = _mm_add_ps(_mm_mul_ps(data_1, coeff_1), result); + result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result); + result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result); + result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result); + } + + _mm_store_ps(dst2_ptr + x, result); + current_coeff += filter_size * 4; + } + } +#endif + constexpr int PIXELS_AT_A_TIME = 4; + // source_overread_beyond_targetx must be compatible with the number of source pixels loaded by SIMD load. + // loadu_ps: 4 pixels. + const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME; + + // this is not good, height mod 2 must be used src_ptr2 would access beyond frame + for (int y = 0; y < height; y += 2) { + current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch; + float* AVS_RESTRICT dst2_ptr2 = dst + (y + 1) * dst_pitch; + const float* src_ptr = src + y * src_pitch; + const float* src_ptr2 = src + (y + 1) * src_pitch; + + // 1st pass: from 0 to width_safe_mod in PIXELS_AT_A_TIME steps + // 2nd pass: from width_safe_mod to width in single pixel steps + //for (int x = 0; x < width_safe_mod; x += PIXELS_AT_A_TIME) { + for (int x = 0; x < width; x += PIXELS_AT_A_TIME) { + + __m128 result = _mm_setzero_ps(); + __m128 result2 = _mm_setzero_ps(); + + for (int i = 0; i < kernel_size; i += 4) { // it is always mod4 ? + + const int begin1 = program->pixel_offset[x + 0]; + const int begin2 = program->pixel_offset[x + 1]; + const int begin3 = program->pixel_offset[x + 2]; + const int begin4 = program->pixel_offset[x + 3]; + + // this is not good, src_ptr must be used instead of src_ptr + i + __m128 data_1 = _mm_loadu_ps(src_ptr + i + begin1); + __m128 data_2 = _mm_loadu_ps(src_ptr + i + begin2); + __m128 data_3 = _mm_loadu_ps(src_ptr + i + begin3); + __m128 data_4 = _mm_loadu_ps(src_ptr + i + begin4); + + __m128 data_1_2 = _mm_loadu_ps(src_ptr2 + i + begin1); + __m128 data_2_2 = _mm_loadu_ps(src_ptr2 + i + begin2); + __m128 data_3_2 = _mm_loadu_ps(src_ptr2 + i + begin3); + __m128 data_4_2 = _mm_loadu_ps(src_ptr2 + i + begin4); + + __m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0); + __m128 coeff_2 = _mm_load_ps(current_coeff + i + filter_size * 1); + __m128 coeff_3 = _mm_load_ps(current_coeff + i + filter_size * 2); + __m128 coeff_4 = _mm_load_ps(current_coeff + i + filter_size * 3); + + _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4); + _MM_TRANSPOSE4_PS(data_1_2, data_2_2, data_3_2, data_4_2); + _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4); + + result = _mm_add_ps(_mm_mul_ps(data_1, coeff_1), result); + result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result); + result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result); + result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result); + + result2 = _mm_add_ps(_mm_mul_ps(data_1_2, coeff_1), result2); + result2 = _mm_add_ps(_mm_mul_ps(data_2_2, coeff_2), result2); + result2 = _mm_add_ps(_mm_mul_ps(data_3_2, coeff_3), result2); + result2 = _mm_add_ps(_mm_mul_ps(data_4_2, coeff_4), result2); + + } + + _mm_store_ps(dst2_ptr + x, result); + _mm_store_ps(dst2_ptr2 + x, result2); + + current_coeff += filter_size * 4; + } + } + + // to do - need to process last row of not-mod2 heights +} + +// Safe partial load with SSE2 +// Read exactly N pixels, avoiding +// - reading beyond the end of the source buffer. +// - avoid NaN contamination, since event with zero coefficients NaN * 0 = NaN +template +AVS_FORCEINLINE static __m128 load_partial_safe_sse2(const float* src_ptr_offsetted) { + switch (Nmod4) { + case 1: + return _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted[0]); + // ideally: movss + case 2: + return _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted[1], src_ptr_offsetted[0]); + // ideally: movsd + case 3: + return _mm_set_ps(0.0f, src_ptr_offsetted[2], src_ptr_offsetted[1], src_ptr_offsetted[0]); + // ideally: movss + movsd + shuffle or movsd + insert + case 0: + return _mm_set_ps(src_ptr_offsetted[3], src_ptr_offsetted[2], src_ptr_offsetted[1], src_ptr_offsetted[0]); + // ideally: movups + default: + return _mm_setzero_ps(); // n/a cannot happen + } +} + +// Processes a horizontal resampling kernel of up to four coefficients for float pixel types. +// Supports BilinearResize, BicubicResize, or sinc with up to 2 taps (filter size <= 4). +// SSE2 optimization loads and processes four float coefficients and pixels simultaneously. +// The 'filtersizemod4' template parameter (0-3) helps optimize for different filter sizes modulo 4. +// This SSE2 requires only filter_size_alignment of 4. +template +void resize_h_planar_float_sse_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) { + assert(filtersizemod4 >= 0 && filtersizemod4 <= 3); + + const int filter_size = program->filter_size; // aligned, practically the coeff table stride + + src_pitch /= sizeof(float); + dst_pitch /= sizeof(float); + + float* src = (float*)src8; + float* dst = (float*)dst8; + + const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float; + + constexpr int PIXELS_AT_A_TIME = 4; // Process four pixels in parallel using SSE2 + + // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width. + // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back + // from the target width, as we load 4 floats at once with '_mm_loadu_ps'. + // In AVX2 we process two lanes, so any of the 8 offsets cannot be safely used, fallback to the unsafe case. + // This is why then safelimit_4_pixels is used combined with safelimit_4 / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME. + const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME; + + // Preconditions: + assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop. + + // 'target_size_alignment' ensures we can safely access coefficients using offsets like + // 'filter_size * 3' when processing 4 H pixels at a time or + // 'filter_size * 7' when processing 8 H pixels at a time or + // 'filter_size * 15' when processing 16 H pixels at a time + assert(program->target_size_alignment >= 4); + + // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads. + assert(program->filter_size_alignment >= 4); + + int x = 0; + + // This 'auto' lambda construct replaces the need of templates + auto do_h_float_core = [&](auto partial_load) { + // Load up to 4 coefficients at once before the height loop. + // Pre-loading and transposing coefficients keeps register usage efficient. + // Assumes 'filter_size_aligned' is at least 4. + __m128 coeff_1 = _mm_load_ps(current_coeff + filter_size * 0); // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3]) + __m128 coeff_2 = _mm_load_ps(current_coeff + filter_size * 1); // for src_ptr + begin2 [0..3] + __m128 coeff_3 = _mm_load_ps(current_coeff + filter_size * 2); // for src_ptr + begin3 [0..3] + __m128 coeff_4 = _mm_load_ps(current_coeff + filter_size * 3); // for src_ptr + begin4 [0..3] + + _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4); + + float* AVS_RESTRICT dst_ptr = dst + x; + const float* src_ptr = src; + + // Pixel offsets for the current target x-positions. + // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'. + const int begin1 = program->pixel_offset[x + 0]; + const int begin2 = program->pixel_offset[x + 1]; + const int begin3 = program->pixel_offset[x + 2]; + const int begin4 = program->pixel_offset[x + 3]; + + for (int y = 0; y < height; y++) + { + __m128 data_1; + __m128 data_2; + __m128 data_3; + __m128 data_4; + if constexpr (partial_load) { + // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function + // to prevent reading beyond the allocated source scanline. This handles cases where loading 4 floats + // starting from 'src_ptr + beginX' might exceed the source buffer. + + // Example of the unsafe scenario: If target width is 320, a naive load at src_ptr + 317 + // would attempt to read floats at indices 317, 318, 319, and 320, potentially going out of bounds. + + // Two main issues in the unsafe zone: + // 1.) Out-of-bounds memory access: Reading beyond the allocated memory for the source scanline can + // lead to access violations and crashes. '_mm_loadu_ps' attempts to load 16 bytes, so even if + // the starting address is within bounds, subsequent reads might not be. + // 2.) Garbage or NaN values: Even if a read doesn't cause a crash, accessing uninitialized or + // out-of-bounds memory (especially for float types) can result in garbage data, including NaN. + // Multiplying by a valid coefficient and accumulating this NaN can contaminate the final result. + + // 'load_partial_safe_sse2' safely loads up to 'filter_size_real' pixels and pads with zeros if needed, + // preventing out-of-bounds reads and ensuring predictable results even near the image edges. + + data_1 = load_partial_safe_sse2(src_ptr + begin1); + data_2 = load_partial_safe_sse2(src_ptr + begin2); + data_3 = load_partial_safe_sse2(src_ptr + begin3); + data_4 = load_partial_safe_sse2(src_ptr + begin4); + } + else { + // In the safe zone, we can directly load 4 pixels at a time using unaligned loads. + data_1 = _mm_loadu_ps(src_ptr + begin1); + data_2 = _mm_loadu_ps(src_ptr + begin2); + data_3 = _mm_loadu_ps(src_ptr + begin3); + data_4 = _mm_loadu_ps(src_ptr + begin4); + } + + _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4); + + __m128 result = _mm_mul_ps(data_1, coeff_1); + result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result); + result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result); + result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result); + + _mm_store_ps(dst_ptr, result); + + dst_ptr += dst_pitch; + src_ptr += src_pitch; + } // y + current_coeff += filter_size * 4; // Move to the next set of coefficients for the next 4 output pixels + }; // end of lambda + + // Process the 'safe zone' where direct full unaligned loads are acceptable. + for (; x < width_safe_mod; x += PIXELS_AT_A_TIME) + { + do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm_loadu_ps + } + + // Process the potentially 'unsafe zone' near the image edge, using safe loading. + for (; x < width; x += PIXELS_AT_A_TIME) + { + do_h_float_core(std::true_type{}); // partial_load == true, use the safer 'load_partial_safe_sse2' + } +} + +// Instantiate them +template void resize_h_planar_float_sse_transpose_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_sse_transpose_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_sse_transpose_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +template void resize_h_planar_float_sse_transpose_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + diff --git a/avs_core/filters/intel/resample_sse.h b/avs_core/filters/intel/resample_sse.h index 8c85795a3..c09228a07 100644 --- a/avs_core/filters/intel/resample_sse.h +++ b/avs_core/filters/intel/resample_sse.h @@ -60,4 +60,9 @@ __attribute__((__target__("ssse3"))) #endif void resizer_h_ssse3_generic_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); +void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + +template +void resize_h_planar_float_sse_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); + #endif // __Resample_SSE_H__ diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp index 7caf9d467..18ca2ba5c 100644 --- a/avs_core/filters/resample.cpp +++ b/avs_core/filters/resample.cpp @@ -36,6 +36,9 @@ #ifdef INTEL_INTRINSICS #include "intel/resample_sse.h" #include "intel/resample_avx2.h" +#ifdef INTEL_INTRINSICS_AVX512 +#include "intel/resample_avx512.h" +#endif #include "intel/turn_sse.h" #endif #include @@ -71,15 +74,34 @@ // while maintaining correct coefficient positioning and proper zero padding. +static void checkAndSetOverread(int end_pos, SafeLimit& safelimit, int start_pos, int i, int source_size) { + if (end_pos > source_size) { + if (!safelimit.overread_possible) { + safelimit.overread_possible = true; + safelimit.source_overread_offset = start_pos; + safelimit.source_overread_beyond_targetx = i; + } + } +} + + void resize_prepare_coeffs(ResamplingProgram* p, IScriptEnvironment* env, int filter_size_alignment) { p->filter_size_alignment = filter_size_alignment; - p->overread_possible = false; + p->safelimit_filter_size_aligned.overread_possible = false; + p->safelimit_4_pixels.overread_possible = false; + p->safelimit_8_pixels.overread_possible = false; + p->safelimit_16_pixels.overread_possible = false; + p->safelimit_32_pixels.overread_possible = false; // note: filter_size_real was the max(kernel_sizes[]) int filter_size_aligned = AlignNumber(p->filter_size_real, p->filter_size_alignment); int target_size_aligned = AlignNumber(p->target_size, ALIGN_RESIZER_TARGET_SIZE); + // align target_size to 8 units to allow safe up to 8 pixels/cycle in H resizers. modded later. + p->target_size_alignment = ALIGN_RESIZER_TARGET_SIZE; + + // Common variables for both float and integer paths void* new_coeff = nullptr; void* src_coeff = nullptr; @@ -146,28 +168,27 @@ void resize_prepare_coeffs(ResamplingProgram* p, IScriptEnvironment* env, int fi // we must protect against source scanline overread. // Using this not in only 32-bit float resizers is new in 3.7.4. const int start_pos = p->pixel_offset[i]; - const int end_pos_aligned = start_pos + filter_size_aligned - 1; const int end_pos = start_pos + p->filter_size_real - 1; if (end_pos >= p->source_size) { // This issue has already been fixed, so it cannot occur. } // Check for SIMD optimization limits - if (end_pos_aligned >= p->source_size) { - if (!p->overread_possible) { - // Register the first occurrence, because we are entering the danger zone from here. - // Up to this point, template-based alignment-aware quick code can be used - // in H resizers. But beyond this point an e.g. _mm256_loadu_si256() would read into - // invalid memory area at the end of the frame buffer. - p->overread_possible = true; - p->source_overread_offset = start_pos; - p->source_overread_beyond_targetx = i; - } - } + // a.) when filter_size_aligned pixels are read (e.g. 16 byte SIMD load: 4 float pixels must be safely read) + // b.-e.) same for exacly 4, 8, 16 and 32 pixels + // We register only the first occurrence, because we are entering the danger zone from there. + // Up to this point, it is safe to read 4/8/... pixels from "start_pos" in the actual line. + // e.g. reading 4 floats will not read beyond the last pixel in line. Used in modified H resizers. + + checkAndSetOverread(start_pos + filter_size_aligned - 1, p->safelimit_filter_size_aligned, start_pos, i, p->source_size); + checkAndSetOverread(start_pos + 4 - 1, p->safelimit_4_pixels, start_pos, i, p->source_size); + checkAndSetOverread(start_pos + 8 - 1, p->safelimit_8_pixels, start_pos, i, p->source_size); + checkAndSetOverread(start_pos + 16 - 1, p->safelimit_16_pixels, start_pos, i, p->source_size); + checkAndSetOverread(start_pos + 32 - 1, p->safelimit_32_pixels, start_pos, i, p->source_size); } // Fill the extra offset after target_size with fake values. - // Our aim is to have a safe, up to 8 pixels/cycle simd loop for V resizers. + // Our aim is to have a safe, up to 8 pixels/cycle simd loop for V and specific H resizers. // Their coeffs will be 0, so they don't count if such coeffs // are multiplied with invalid pixels. if (p->target_size < target_size_aligned) { @@ -176,6 +197,8 @@ void resize_prepare_coeffs(ResamplingProgram* p, IScriptEnvironment* env, int fi for (int i = p->target_size; i < target_size_aligned; ++i) { p->kernel_sizes[i] = p->filter_size_real; p->pixel_offset[i] = 0; // 0th pixel offset makes no harm + // even if this ensures the in-line safety, alternative H resizer implementations must + // not read beyond last line, where y>=height. } } @@ -1044,7 +1067,7 @@ void resizer_h_c_generic_uint8_16_vectorized(BYTE* dst8, const BYTE* src8, int d dst_pitch /= sizeof(pixel_t); src_pitch /= sizeof(pixel_t); - const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8; + const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8; for (int y = 0; y < height; y++) { const short* current_coeff_base = program->pixel_coefficient; @@ -1584,11 +1607,46 @@ ResamplerH FilteredResizeH::GetResampler(int CPU, int pixelsize, int bits_per_pi } else { //if (pixelsize == 4) #ifdef INTEL_INTRINSICS +#ifdef INTEL_INTRINSICS_AVX512 + if ((CPU & CPUF_AVX512F) && program->filter_size_real <= 4) { + //return resize_h_planar_float_avx2_permutex_vstripe_ks4; + switch (program->filter_size_real) { +/* case 1: return resize_h_planar_float_avx512_transpose_vstripe_ks4<1>; break; + case 2: return resize_h_planar_float_avx512_transpose_vstripe_ks4<2>; break; + case 3: return resize_h_planar_float_avx512_transpose_vstripe_ks4<3>; break; + case 4: return resize_h_planar_float_avx512_transpose_vstripe_ks4<0>; break;*/ + case 1: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<1>; break; + case 2: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<2>; break; + case 3: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<3>; break; + case 4: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<0>; break; + } + } +#endif if (CPU & CPUF_AVX2) { - return resizer_h_avx2_generic_float; + //return resize_h_planar_float_avx2_permutex_vstripe_ks4; + + switch (program->filter_size_real) { +/* case 1: return resize_h_planar_float_avx_transpose_vstripe_ks4<1>; break; + case 2: return resize_h_planar_float_avx_transpose_vstripe_ks4<2>; break; + case 3: return resize_h_planar_float_avx_transpose_vstripe_ks4<3>; break; + case 4: return resize_h_planar_float_avx_transpose_vstripe_ks4<0>; break;*/ + case 1: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<1>; break; + case 2: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<2>; break; + case 3: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<3>; break; + case 4: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<0>; break; + default: return resizer_h_avx2_generic_float; + } + } if (CPU & CPUF_SSSE3) { - return resizer_h_ssse3_generic_float; + // return resizer_h_ssse3_generic_float; + switch (program->filter_size_real) { + case 1: return resize_h_planar_float_sse_transpose_vstripe_ks4<1>; break; + case 2: return resize_h_planar_float_sse_transpose_vstripe_ks4<2>; break; + case 3: return resize_h_planar_float_sse_transpose_vstripe_ks4<3>; break; + case 4: return resize_h_planar_float_sse_transpose_vstripe_ks4<0>; break; + default: return resizer_h_ssse3_generic_float; + } } #endif return resize_h_c_planar; @@ -1751,6 +1809,10 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi if (pixelsize == 1) { #ifdef INTEL_INTRINSICS +#ifdef INTEL_INTRINSICS_AVX512 + if (CPU & CPUF_AVX512F) + return resize_v_avx512_planar_uint8_t_w_sr; +#endif if (CPU & CPUF_AVX2) return resize_v_avx2_planar_uint8_t; if (CPU & CPUF_SSE2) @@ -1766,6 +1828,13 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi else if (pixelsize == 2) { #ifdef INTEL_INTRINSICS +#ifdef INTEL_INTRINSICS_AVX512 + if (CPU & CPUF_AVX512F) + if (bits_per_pixel < 16) + return resize_v_avx512_planar_uint16_t_w_sr; + else + return resize_v_avx512_planar_uint16_t_w_sr; +#endif if (CPU & CPUF_AVX2) { if (bits_per_pixel < 16) return resize_v_avx2_planar_uint16_t; @@ -1788,8 +1857,15 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi else // pixelsize== 4 { #ifdef INTEL_INTRINSICS +#ifdef INTEL_INTRINSICS_AVX512 + if (CPU & CPUF_AVX512F) { +// return resize_v_avx512_planar_float; + return resize_v_avx512_planar_float_w_sr; + } +#endif if (CPU & CPUF_AVX2) { - return resize_v_avx2_planar_float; +// return resize_v_avx2_planar_float; + return resize_v_avx2_planar_float_w_sr; } if (CPU & CPUF_SSE2) { return resize_v_sse2_planar_float; @@ -1909,15 +1985,24 @@ PClip FilteredResize::CreateResize(PClip clip, int target_width, int target_heig // 3 - force H and V const bool force_H = force == 1 || force == 3; const bool force_V = force == 2 || force == 3; - if (area_FirstH < area_FirstV) - { - result = CreateResizeV(clip, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env); - result = CreateResizeH(result, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env); - } + + if (force == 3) // not very good manual forcing of special 2pass mode, better to nake selection if both H and V resizs required, currently for test only + result = new FilteredResize_2p(clip, + subrange_left, subrange_width, target_width, + subrange_top, subrange_height, target_height, + f, preserve_center, chroma_placement, env); else { - result = CreateResizeH(clip, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env); - result = CreateResizeV(result, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env); + if (area_FirstH < area_FirstV) + { + result = CreateResizeV(clip, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env); + result = CreateResizeH(result, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env); + } + else + { + result = CreateResizeH(clip, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env); + result = CreateResizeV(result, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env); + } } return result; } @@ -2095,3 +2180,482 @@ AVSValue __cdecl FilteredResize::Create_UserDefined2Resize(AVSValue args, void*, return CreateResize(args[0].AsClip(), args[1].AsInt(), args[2].AsInt(), &args[6], force, &f, preserve_center, placement_name, forced_chroma_placement, env); } + +/*************************************** + ***** Filtered Resize - 2p ****** + ***************************************/ + +FilteredResize_2p::FilteredResize_2p(PClip _child, + double subrange_left, double subrange_width, int target_width, + double subrange_top, double subrange_height, int target_height, + ResamplingFunction* func, bool preserve_center, int chroma_placement, IScriptEnvironment* env) + : GenericVideoFilter(_child), + resampling_program_luma_h(0), resampling_program_chroma_h(0), + resampling_program_luma_v(0), resampling_program_chroma_v(0) +{ + if (target_height <= 0) + env->ThrowError("Resize: Height must be greater than 0."); + + if (target_width <= 0) + env->ThrowError("Resize: Width must be greater than 0."); + + // set class globals + src_width = vi.width; + src_height = vi.height; + dst_width = target_width; + dst_height = target_height; + + pixelsize = vi.ComponentSize(); // AVS16 + bits_per_pixel = vi.BitsPerComponent(); + grey = vi.IsY(); + bool isRGBPfamily = vi.IsPlanarRGB() || vi.IsPlanarRGBA(); + + if (vi.IsPlanar() && !grey && !isRGBPfamily) { + const int mask = (1 << vi.GetPlaneHeightSubsampling(PLANAR_U)) - 1; + + if (target_height & mask) + env->ThrowError("Resize: Planar destination height must be a multiple of %d.", mask + 1); + } + + if (vi.IsRGB() && !isRGBPfamily) + subrange_top = vi.height - subrange_top - subrange_height; // packed RGB upside down + +#ifdef INTEL_INTRINSICS + int cpu = env->GetCPUFlags(); +#else + int cpu = 0; +#endif + + double center_pos_v_luma; + double center_pos_v_chroma; + GetCenterShiftForResizers(center_pos_v_luma, center_pos_v_chroma, preserve_center, chroma_placement, vi, false /* for vertical */); + + double center_pos_h_luma; + double center_pos_h_chroma; + GetCenterShiftForResizers(center_pos_h_luma, center_pos_h_chroma, preserve_center, chroma_placement, vi, true /* for horizontal */); + // 3.7.4- parameter, old Avisynth behavior: 0.5, 0.5 + + // Create resampling program and pitch table for H + resampling_program_luma_h = func->GetResamplingProgram(vi.width, subrange_left, subrange_width, target_width, bits_per_pixel, + center_pos_h_luma, center_pos_h_luma, // for resizing it's the same for source and dest + env); + resampler_luma_h = GetResamplerH(cpu, pixelsize, bits_per_pixel, resampling_program_luma_h, env); + + // Create resampling program and pitch table for V + resampling_program_luma_v = func->GetResamplingProgram(vi.height, subrange_top, subrange_height, target_height, bits_per_pixel, + center_pos_v_luma, center_pos_v_luma, // for resizing it's the same for source and dest + env); + resampler_luma_v = GetResamplerV(cpu, pixelsize, bits_per_pixel, resampling_program_luma_v, env); + + + if (vi.IsPlanar() && !grey && !isRGBPfamily) { + const int shift = vi.GetPlaneHeightSubsampling(PLANAR_U); + const int div = 1 << shift; + + resampling_program_chroma_v = func->GetResamplingProgram( + vi.height >> shift, + subrange_top / div, + subrange_height / div, + target_height >> shift, + bits_per_pixel, + center_pos_v_chroma, center_pos_v_chroma, // for resizing it's the same for source and dest + env); + + resampler_chroma_v = GetResamplerV(cpu, pixelsize, bits_per_pixel, resampling_program_chroma_v, env); + } + + if (vi.IsPlanar() && !grey && !isRGBPfamily) { + const int shift = vi.GetPlaneWidthSubsampling(PLANAR_U); + const int div = 1 << shift; + + resampling_program_chroma_h = func->GetResamplingProgram( + vi.width >> shift, + subrange_left / div, + subrange_width / div, + target_width >> shift, + bits_per_pixel, + center_pos_h_chroma, center_pos_h_chroma, // horizontal + env); + + resampler_chroma_h = GetResamplerH(cpu, pixelsize, bits_per_pixel, resampling_program_chroma_h, env); + } + + // Change target video info size + vi.height = target_height; + vi.width = target_width; +} + +#if 0 // expected worse in performance - left for performance tests +PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env) // use env->Allocate() to get temp buf from other allocated memory - it is NOT returned to the memory pool for the NewVideoFrameP for the downstream filter to write to ? +{ + PVideoFrame src = child->GetFrame(n, env); + PVideoFrame dst = env->NewVideoFrameP(vi, &src); + int src_pitch = src->GetPitch(); + int dst_pitch = dst->GetPitch(); + const BYTE* srcp = src->GetReadPtr(); + BYTE* dstp = dst->GetWritePtr(); // for first (largest ?) plane or for single ? + + bool isRGBPfamily = vi.IsPlanarRGB() || vi.IsPlanarRGBA(); + + BYTE* temp_1 = static_cast(env->Allocate(dst_pitch * dst_height, FRAME_ALIGN, AVS_POOLED_ALLOC)); + if (!temp_1 ) { + env->Free(temp_1); + env->ThrowError("Could not reserve temp memory in a resampler_2p."); + } + + // Do resizing, single plane by plane + resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel); + int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent() + resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel); + + if (isRGBPfamily) + { + src_pitch = src->GetPitch(PLANAR_B); + dst_pitch = dst->GetPitch(PLANAR_B); + srcp = src->GetReadPtr(PLANAR_B); + dstp = dst->GetWritePtr(PLANAR_B); + + resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel); + int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent() + resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel); + + src_pitch = src->GetPitch(PLANAR_R); + dst_pitch = dst->GetPitch(PLANAR_R); + srcp = src->GetReadPtr(PLANAR_R); + dstp = dst->GetWritePtr(PLANAR_R); + + resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel); + resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel); + + } + else if (!grey && vi.IsPlanar()) { + int width = vi.width >> vi.GetPlaneWidthSubsampling(PLANAR_U); + int height = vi.height >> vi.GetPlaneHeightSubsampling(PLANAR_U); + + // Plane U resizing + src_pitch = src->GetPitch(PLANAR_U); + dst_pitch = dst->GetPitch(PLANAR_U); + srcp = src->GetReadPtr(PLANAR_U); + dstp = dst->GetWritePtr(PLANAR_U); + + resampler_chroma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel); + resampler_chroma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_chroma_v, width, height, bits_per_pixel); + + // Plane V resizing + src_pitch = src->GetPitch(PLANAR_V); + dst_pitch = dst->GetPitch(PLANAR_V); + srcp = src->GetReadPtr(PLANAR_V); + dstp = dst->GetWritePtr(PLANAR_V); + + resampler_chroma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel); + resampler_chroma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_chroma_v, width, height, bits_per_pixel); + + } + + if (vi.IsYUVA() || vi.IsPlanarRGBA()) { + src_pitch = src->GetPitch(PLANAR_A); + dst_pitch = dst->GetPitch(PLANAR_A); + srcp = src->GetReadPtr(PLANAR_A); + dstp = dst->GetWritePtr(PLANAR_A); + + resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel); + int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent() + resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel); + } + + env->Free(temp_1); + + return dst; +} +#endif + +PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env) // use NewVideoFrame as temp buf to return it in the vfb pool after exit this filter +{ + PVideoFrame src = child->GetFrame(n, env); + PVideoFrame dst = env->NewVideoFrameP(vi, &src); + + PVideoFrame tmp = env->NewVideoFrame(vi); // no need frame properties copy, use as temporal buffer only and its refcount will be zeroed at function exit with object auto-release/destructor (PVideoFrame::~PVideoFrame() ) + /* + Here we need to ask ScriptEnvironment to look for output format of downstream filter ? So it is not trans-in-place filter we can request frame buffer larger and left + it unused after exiting this function. Only in this case there is a big probability the env->NewVideoFrameP(vi, &src); for downstream filter call will return this same virtual address buffer + to the downstream filter and it can be (at least partially) overwritten saving from useless downloading from CPU cache. It is new TODO idea for modification of ScriptEnvironment vfb memory management. + After this will be implemented - we can use such method of requesting temp buffer (frame) to use in 2pass resize. + If this is last filter in a chain - simply request lowest possible sized frame. + + Update 30.05.2025: The expected transfer of tmp buf address to downstream fiter dst frame sometime happens - but how frequently it happens in real scripts running - need to be discovered. + + As env->Allocate/Free buffers are definitely worse (only good if downstream filter will request same temp buf for write) - this temp method expected to be faster (as first expectations). + */ + + int src_pitch = src->GetPitch(); + int dst_pitch = dst->GetPitch(); + const BYTE* srcp = src->GetReadPtr(); + BYTE* dstp = dst->GetWritePtr(); // for first (largest ?) plane or for single ? + + bool isRGBPfamily = vi.IsPlanarRGB() || vi.IsPlanarRGBA(); + + const BYTE* tmp_srcp = tmp->GetReadPtr(); + BYTE* tmp_dstp = tmp->GetWritePtr(); // for first (largest ?) plane or for single ? + int tmp_pitch = tmp->GetPitch(); + + // Do resizing, single plane by plane + resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel); + int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent() + resampler_luma_v(dstp, tmp_srcp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel); + + + if (isRGBPfamily) + { + src_pitch = src->GetPitch(PLANAR_B); + dst_pitch = dst->GetPitch(PLANAR_B); + srcp = src->GetReadPtr(PLANAR_B); + dstp = dst->GetWritePtr(PLANAR_B); + + resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel); + int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent() + resampler_luma_v(dstp, tmp_srcp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel); + + src_pitch = src->GetPitch(PLANAR_R); + dst_pitch = dst->GetPitch(PLANAR_R); + srcp = src->GetReadPtr(PLANAR_R); + dstp = dst->GetWritePtr(PLANAR_R); + + resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel); + resampler_luma_v(dstp, tmp_srcp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel); + + } + else if (!grey && vi.IsPlanar()) { + int width = vi.width >> vi.GetPlaneWidthSubsampling(PLANAR_U); + int height = vi.height >> vi.GetPlaneHeightSubsampling(PLANAR_U); + + // Plane U resizing + src_pitch = src->GetPitch(PLANAR_U); + dst_pitch = dst->GetPitch(PLANAR_U); + srcp = src->GetReadPtr(PLANAR_U); + dstp = dst->GetWritePtr(PLANAR_U); + + resampler_chroma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel); + resampler_chroma_v(dstp, tmp_dstp, dst_pitch, tmp_pitch, resampling_program_chroma_v, width, height, bits_per_pixel); + + // Plane V resizing + src_pitch = src->GetPitch(PLANAR_V); + dst_pitch = dst->GetPitch(PLANAR_V); + srcp = src->GetReadPtr(PLANAR_V); + dstp = dst->GetWritePtr(PLANAR_V); + + resampler_chroma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_V), bits_per_pixel); + resampler_chroma_v(dstp, tmp_dstp, dst_pitch, tmp_pitch, resampling_program_chroma_v, width, height, bits_per_pixel); + + } + + if (vi.IsYUVA() || vi.IsPlanarRGBA()) { + src_pitch = src->GetPitch(PLANAR_A); + dst_pitch = dst->GetPitch(PLANAR_A); + srcp = src->GetReadPtr(PLANAR_A); + dstp = dst->GetWritePtr(PLANAR_A); + + resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel); + int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent() + resampler_luma_v(dstp, tmp_dstp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel); + } + + return dst; +} + + +ResamplerV FilteredResize_2p::GetResamplerV(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env) // may be somehow call same method from FilteredResizeV class ? +{ + + resize_prepare_coeffs(program, env, 8); + // for SIMD friendliness and more: consolidate the kernel_size vs filter_size at the end. + // See comments at FilteredResizeH::GetResampler + + if (program->filter_size == 1) { + // Fast pointresize + switch (pixelsize) // AVS16 + { + case 1: return resize_v_planar_pointresize; + case 2: return resize_v_planar_pointresize; + default: // case 4: + return resize_v_planar_pointresize; + } + } + else { + // Other resizers + if (pixelsize == 1) + { +#ifdef INTEL_INTRINSICS +#ifdef INTEL_INTRINSICS_AVX512 + if (CPU & CPUF_AVX512F) + return resize_v_avx512_planar_uint8_t_w_sr; +#endif + if (CPU & CPUF_AVX2) + return resize_v_avx2_planar_uint8_t; + if (CPU & CPUF_SSE2) + return resize_v_sse2_planar; +#ifdef X86_32 + if (CPU & CPUF_MMX) + return resize_v_mmx_planar; +#endif +#endif + // C version + return resize_v_c_planar_uint8_16_t_auto_vectorized; + } + else if (pixelsize == 2) + { +#ifdef INTEL_INTRINSICS +#ifdef INTEL_INTRINSICS_AVX512 + if (CPU & CPUF_AVX512F) + if (bits_per_pixel < 16) + return resize_v_avx512_planar_uint16_t_w_sr; + else + return resize_v_avx512_planar_uint16_t_w_sr; +#endif + if (CPU & CPUF_AVX2) { + if (bits_per_pixel < 16) + return resize_v_avx2_planar_uint16_t; + else + return resize_v_avx2_planar_uint16_t; + } + if (CPU & CPUF_SSE2) { + if (bits_per_pixel < 16) + return resize_v_sse2_planar_uint16_t; + else + return resize_v_sse2_planar_uint16_t; + } +#endif + // C version + if (bits_per_pixel == 16) + return resize_v_c_planar_uint8_16_t_auto_vectorized; + else + return resize_v_c_planar_uint8_16_t_auto_vectorized; + } + else // pixelsize== 4 + { +#ifdef INTEL_INTRINSICS +#ifdef INTEL_INTRINSICS_AVX512 + if (CPU & CPUF_AVX512F) { +// return resize_v_avx512_planar_float; + return resize_v_avx512_planar_float_w_sr; + } +#endif + if (CPU & CPUF_AVX2) { + return resize_v_avx2_planar_float; + } + if (CPU & CPUF_SSE2) { + return resize_v_sse2_planar_float; + } +#endif + return resize_v_c_planar_float_auto_vectorized; + } + } +} + +ResamplerH FilteredResize_2p::GetResamplerH(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env) // may be somehow call same method from FilteredResizeH class ? +{ + int simd_coeff_count_padding = 8; + + // Both 8-bit and 16-bit SSSE3 and AVX2 horizontal resizers benefit from processing 16 pixels per cycle. + // Floats also use 32 bytes, but since 32/sizeof(float) = 8, processing 16 pixels is unnecessary. + // Even in C, the code is optimized to be vector-friendly. + if (pixelsize == 1 || pixelsize == 2) + simd_coeff_count_padding = 16; + + // Not only does it prepare and pad for SIMD/vector code, but it also corrects, reorders, and equalizes coefficients + // at the right and bottom ends, since we may have variable kernel sizes due to boundary conditions. + resize_prepare_coeffs(program, env, simd_coeff_count_padding); + + if (pixelsize == 1) + { +#ifdef INTEL_INTRINSICS + if (CPU & CPUF_AVX2) { + return resizer_h_avx2_generic_uint8_t; + } + if (CPU & CPUF_SSSE3) { + return resizer_h_ssse3_generic_uint8_16; + } +#endif + return resizer_h_c_generic_uint8_16_vectorized; + //return resize_h_c_planar; + } + else if (pixelsize == 2) { +#ifdef INTEL_INTRINSICS + if (CPU & CPUF_AVX2) { + if (bits_per_pixel < 16) + return resizer_h_avx2_generic_uint16_t; + else + return resizer_h_avx2_generic_uint16_t; + } + if (CPU & CPUF_SSSE3) { + if (bits_per_pixel < 16) + return resizer_h_ssse3_generic_uint8_16; + else + return resizer_h_ssse3_generic_uint8_16; + } +#endif + if (bits_per_pixel == 16) + return resizer_h_c_generic_uint8_16_vectorized; + // return resize_h_c_planar; + else + return resizer_h_c_generic_uint8_16_vectorized; + // return resize_h_c_planar; + } + else { //if (pixelsize == 4) +#ifdef INTEL_INTRINSICS +#ifdef INTEL_INTRINSICS_AVX512 + if ((CPU & CPUF_AVX512F) && program->filter_size_real <= 4) { + //return resize_h_planar_float_avx2_permutex_vstripe_ks4; + switch (program->filter_size_real) { + /* case 1: return resize_h_planar_float_avx512_transpose_vstripe_ks4<1>; break; + case 2: return resize_h_planar_float_avx512_transpose_vstripe_ks4<2>; break; + case 3: return resize_h_planar_float_avx512_transpose_vstripe_ks4<3>; break; + case 4: return resize_h_planar_float_avx512_transpose_vstripe_ks4<0>; break;*/ + case 1: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<1>; break; + case 2: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<2>; break; + case 3: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<3>; break; + case 4: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<0>; break; + } + } +#endif + if (CPU & CPUF_AVX2) { + //return resize_h_planar_float_avx2_permutex_vstripe_ks4; + + switch (program->filter_size_real) { + /* case 1: return resize_h_planar_float_avx_transpose_vstripe_ks4<1>; break; + case 2: return resize_h_planar_float_avx_transpose_vstripe_ks4<2>; break; + case 3: return resize_h_planar_float_avx_transpose_vstripe_ks4<3>; break; + case 4: return resize_h_planar_float_avx_transpose_vstripe_ks4<0>; break;*/ + case 1: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<1>; break; + case 2: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<2>; break; + case 3: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<3>; break; + case 4: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<0>; break; + default: return resizer_h_avx2_generic_float; + } + + } + if (CPU & CPUF_SSSE3) { + // return resizer_h_ssse3_generic_float; + switch (program->filter_size_real) { + case 1: return resize_h_planar_float_sse_transpose_vstripe_ks4<1>; break; + case 2: return resize_h_planar_float_sse_transpose_vstripe_ks4<2>; break; + case 3: return resize_h_planar_float_sse_transpose_vstripe_ks4<3>; break; + case 4: return resize_h_planar_float_sse_transpose_vstripe_ks4<0>; break; + default: return resizer_h_ssse3_generic_float; + } + } +#endif + return resize_h_c_planar; + } +} + + +FilteredResize_2p::~FilteredResize_2p(void) +{ + if (resampling_program_luma_h) { delete resampling_program_luma_h; } + if (resampling_program_chroma_h) { delete resampling_program_chroma_h; } + + if (resampling_program_luma_v) { delete resampling_program_luma_v; } + if (resampling_program_chroma_v) { delete resampling_program_chroma_v; } + + +} diff --git a/avs_core/filters/resample.h b/avs_core/filters/resample.h index 8afed8775..d6272ebee 100644 --- a/avs_core/filters/resample.h +++ b/avs_core/filters/resample.h @@ -121,6 +121,52 @@ class FilteredResizeV : public GenericVideoFilter }; +/** + * Class to resize in the dual directions using a specified sampling filter and lower size used temporal buffer + * Helper for resample functions + **/ +class FilteredResize_2p : public GenericVideoFilter +{ +public: + FilteredResize_2p(PClip _child, + double subrange_left, double subrange_width, int target_width, + double subrange_top, double subrange_height, int target_height, + ResamplingFunction* func, bool preserve_center, int chroma_placement, IScriptEnvironment* env); + virtual ~FilteredResize_2p(void); + + PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) override; + + int __stdcall SetCacheHints(int cachehints, int frame_range) override { + AVS_UNUSED(frame_range); + return cachehints == CACHE_GET_MTMODE ? MT_NICE_FILTER : 0; + } + + static ResamplerH GetResamplerH(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env); + static ResamplerV GetResamplerV(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env); + +private: + bool grey; + int pixelsize; // AVS16 + int bits_per_pixel; + + int src_width, src_height, dst_width, dst_height; + + ResamplingProgram* resampling_program_luma_h; + ResamplingProgram* resampling_program_chroma_h; + + ResamplingProgram* resampling_program_luma_v; + ResamplingProgram* resampling_program_chroma_v; + + ResamplerH resampler_luma_h; + ResamplerH resampler_chroma_h; + + ResamplerV resampler_luma_v; + ResamplerV resampler_chroma_v; + +}; + + + /*** Resample factory methods ***/ class FilteredResize diff --git a/avs_core/filters/resample_functions.cpp b/avs_core/filters/resample_functions.cpp index 437bc6a73..4b0b3399e 100644 --- a/avs_core/filters/resample_functions.cpp +++ b/avs_core/filters/resample_functions.cpp @@ -539,9 +539,9 @@ ResamplingProgram* ResamplingFunction::GetResamplingProgram(int source_size, dou // in order not to have NaN floats if (start_pos + AlignNumber(fir_filter_size, ALIGN_FLOAT_RESIZER_COEFF_SIZE) - 1 > source_size - 1) { - if (!program->overread_possible) { + if (!program->overread_possible_filter_size_aligned) { // register the first occurance - program->overread_possible = true; + program->overread_possible_filter_size_aligned = true; program->source_overread_offset = start_pos; program->source_overread_beyond_targetx = i; } diff --git a/avs_core/filters/resample_functions.h b/avs_core/filters/resample_functions.h index 4e7b71f58..738046824 100644 --- a/avs_core/filters/resample_functions.h +++ b/avs_core/filters/resample_functions.h @@ -48,10 +48,16 @@ constexpr int FPScale = 1 << FPScale8bits; // fixed point scaler (1<<14) // for 16 bits: one bit less constexpr int FPScale16bits = 13; constexpr int FPScale16 = 1 << FPScale16bits; // fixed point scaler for 10-16 bit SIMD signed operation -constexpr int ALIGN_RESIZER_TARGET_SIZE = 8; +constexpr int ALIGN_RESIZER_TARGET_SIZE = 16; // 16: avx512 float Hoprizontal // 09-14-2002 - Vlad59 - Lanczos3Resize - Constant added #define M_PI 3.14159265358979323846 +struct SafeLimit { + bool overread_possible; + int source_overread_offset; + int source_overread_beyond_targetx; +}; + struct ResamplingProgram { IScriptEnvironment * Env; int source_size, target_size; @@ -59,6 +65,7 @@ struct ResamplingProgram { int filter_size; int filter_size_real; // maybe less than filter_size if dimensions are small int filter_size_alignment; // for info, 1 (C, nonvector-friendly), 8 (sse or avx2) or 16 (avx2) + int target_size_alignment; // coeff table exists (and containt zero coeffs) even beyond target_size. Helps alternative H resizers. // Array of Integer indicate starting point of sampling std::vector pixel_offset; @@ -74,24 +81,24 @@ struct ResamplingProgram { std::vector kernel_sizes; // 3.7.4- can be different for each line but then they get equalized and aligned. - // anti-overread helpers for float resizer simd code reading 8 pixels from a given offset - bool overread_possible; - int source_overread_offset; // offset from where reading 8 bytes requires masking garbage on the right side - int source_overread_beyond_targetx; // in H resizers danger zone starts from here. - // When reading aligned_filter_size elements from (src+offset) no longer fits image scanline dimensions - + // When reading multiple (SIMD load) source pixels from (src+offset) and it no + // longer fits image scanline dimensions (width) + SafeLimit safelimit_filter_size_aligned = { false, -1, -1 }; + SafeLimit safelimit_4_pixels = { false, -1, -1 }; + SafeLimit safelimit_8_pixels = { false, -1, -1 }; + SafeLimit safelimit_16_pixels = { false, -1, -1 }; + SafeLimit safelimit_32_pixels = { false, -1, -1 }; ResamplingProgram(int filter_size, int source_size, int target_size, double crop_start, double crop_size, int bits_per_pixel, IScriptEnvironment* env) : Env(env), source_size(source_size), target_size(target_size), crop_start(crop_start), crop_size(crop_size), filter_size(filter_size), filter_size_real(filter_size), bits_per_pixel(bits_per_pixel), pixel_coefficient(0), pixel_coefficient_float(0) { - overread_possible = false; - source_overread_offset = -1; - source_overread_beyond_targetx = -1; - // align target_size to 8 units to allow safe 8 pixels/cycle in H resizers + filter_size_alignment = 1; + // align target_size to 8 units to allow safe up to 8 pixels/cycle in H resizers. modded later. + target_size_alignment = 1; // resize_prepare_coeff can override and realign the size of coefficient table if (bits_per_pixel < 32) pixel_coefficient = (short*)Env->Allocate(sizeof(short) * target_size * filter_size, 64, AVS_NORMAL_ALLOC);