From 8ec252a593a08a8679c8ac9c6ea910a0ea9d9990 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Fri, 2 May 2025 20:24:24 +0300
Subject: [PATCH 01/27] Added example of transpose-based SIMD H-resize function

New function: resize_h_planar_float_sse_transpose().
---
 avs_core/filters/intel/resample_sse.cpp | 59 +++++++++++++++++++++++++
 avs_core/filters/intel/resample_sse.h   |  2 +
 2 files changed, 61 insertions(+)
diff --git a/avs_core/filters/intel/resample_sse.cpp b/avs_core/filters/intel/resample_sse.cpp
index 0fc6c66ca..13e8924dc 100644
--- a/avs_core/filters/intel/resample_sse.cpp
+++ b/avs_core/filters/intel/resample_sse.cpp
@@ -1018,3 +1018,62 @@ template void resizer_h_ssse3_generic_uint8_16<uint16_t, true>(BYTE* dst8, const
 template void resize_v_sse2_planar_uint16_t<false>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 template void resize_v_sse2_planar_uint16_t<true>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 
+// Transpose-based SIMD
+void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+	int filter_size = program->filter_size;
+
+	const float* AVS_RESTRICT current_coeff;
+
+	src_pitch = src_pitch / sizeof(float);
+	dst_pitch = dst_pitch / sizeof(float);
+
+	float* src = (float*)src8;
+	float* dst = (float*)dst8;
+
+	const int kernel_size = program->filter_size_real;
+	const int ksmod4 = kernel_size / 4 * 4;
+	//	const int ksmod8 = kernel_size / 8 * 8;
+
+	for (int y = 0; y < height; y++) {
+		current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+		float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
+		const float* src_ptr = src + y * src_pitch;
+
+		for (int x = 0; x < width; x+=4) {
+
+			__m128 result = _mm_setzero_ps();
+
+			for (int i = 0; i < ksmod4; i += 4) {
+				__m128 data_1 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + i + 0]);
+				__m128 data_2 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + i + 1]);
+				__m128 data_3 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + i + 2]);
+				__m128 data_4 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + i + 3]);
+
+				__m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0); // is it correct for i > 0 ? may be filter_size * (0 + i) ?
+				__m128 coeff_2 = _mm_load_ps(current_coeff + i + filter_size * 1);
+				__m128 coeff_3 = _mm_load_ps(current_coeff + i + filter_size * 2);
+				__m128 coeff_4 = _mm_load_ps(current_coeff + i + filter_size * 3);
+
+				_MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
+				_MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
+
+				__m128 temp_result = _mm_mul_ps(data_1, coeff_1);
+				result = _mm_add_ps(temp_result, result);
+
+				temp_result = _mm_mul_ps(data_2, coeff_2);
+				result = _mm_add_ps(temp_result, result);
+
+				temp_result = _mm_mul_ps(data_3, coeff_3);
+				result = _mm_add_ps(temp_result, result);
+
+				temp_result = _mm_mul_ps(data_4, coeff_4);
+				result = _mm_add_ps(temp_result, result);
+			}
+
+			_mm_store_ps(dst2_ptr + x, result);
+			current_coeff += filter_size * 4;
+		}
+	}
+}
+
diff --git a/avs_core/filters/intel/resample_sse.h b/avs_core/filters/intel/resample_sse.h
index 8c85795a3..e1e20488a 100644
--- a/avs_core/filters/intel/resample_sse.h
+++ b/avs_core/filters/intel/resample_sse.h
@@ -60,4 +60,6 @@ __attribute__((__target__("ssse3")))
 #endif
 void resizer_h_ssse3_generic_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
+void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
 #endif // __Resample_SSE_H__

From 04aea193187e50b663daea7464e6233537ab142c Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Fri, 2 May 2025 20:25:24 +0300
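Subject: [PATCH 02/27] Function switch for testing of H-resize

Adds a commented-out alternative return in FilteredResizeH::GetResampler for switching to the transpose-based resizer during testing.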
---
 avs_core/filters/resample.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index 7caf9d467..1f52fb21d 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -1589,6 +1589,7 @@ ResamplerH FilteredResizeH::GetResampler(int CPU, int pixelsize, int bits_per_pi
     }
     if (CPU & CPUF_SSSE3) {
       return resizer_h_ssse3_generic_float;
+//		return resize_h_planar_float_sse_transpose;
     }
 #endif
     return resize_h_c_planar<float, 0>;

From b7af59117620261a0efb7e1bf124f54406798dac Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sun, 4 May 2025 03:30:34 -0700
Subject: [PATCH 03/27] Finally debugged transpose-based SSE and AVX H-resizers

Ready for testing.
---
 avs_core/filters/intel/resample_avx2.cpp | 202 +++++++++++++++++++++++
 avs_core/filters/intel/resample_avx2.h   |  61 +++++++
 avs_core/filters/intel/resample_sse.cpp  |  87 ++++++++--
 3 files changed, 332 insertions(+), 18 deletions(-)

diff --git a/avs_core/filters/intel/resample_avx2.cpp b/avs_core/filters/intel/resample_avx2.cpp
index 1929021eb..ea4837fca 100644
--- a/avs_core/filters/intel/resample_avx2.cpp
+++ b/avs_core/filters/intel/resample_avx2.cpp
@@ -998,3 +998,205 @@ template void resizer_h_avx2_generic_uint16_t<true>(BYTE* dst8, const BYTE* src8
 template void resize_v_avx2_planar_uint16_t<false>(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 // avx2 10-14bit
 template void resize_v_avx2_planar_uint16_t<true>(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
+
+// Transpose-based SIMD
+void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+    int filter_size = program->filter_size;
+
+    const float* AVS_RESTRICT current_coeff;
+
+    src_pitch = src_pitch / sizeof(float);
+    dst_pitch = dst_pitch / sizeof(float);
+
+    float* src = (float*)src8;
+    float* dst = (float*)dst8;
+
+    const int kernel_size = program->filter_size_real;
+    const int ksmod4 = kernel_size / 4 * 4;
+//    const int ksmod8 = kernel_size / 8 * 8;
+
+#if 0
+    for (int y = 0; y < height; y ++) {
+        current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+        float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
+        const float* src_ptr = src + y * src_pitch;
+
+        for (int x = 0; x < width; x += 8) {
+
+            __m256 result = _mm256_setzero_ps();
+
+            for (int i = 0; i < ksmod4; i += 4) {
+
+                __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0] + i, src_ptr + program->pixel_offset[x + 4] + i);
+                __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1] + i, src_ptr + program->pixel_offset[x + 5] + i);
+                __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2] + i, src_ptr + program->pixel_offset[x + 6] + i);
+                __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3] + i, src_ptr + program->pixel_offset[x + 7] + i);
+
+                __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + i + filter_size * 0, current_coeff + i + filter_size * 4);
+                __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + i + filter_size * 1, current_coeff + i + filter_size * 5);
+                __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + i + filter_size * 2, current_coeff + i + filter_size * 6);
+                __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + i + filter_size * 3, current_coeff + i + filter_size * 7);
+
+
+                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
+                _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+
+                result = _mm256_fmadd_ps(data_1_data_5, coef_1_coef_5, result);
+                result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+                result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+                result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+
+            }
+
+            _mm256_store_ps(dst2_ptr + x, result);
+            current_coeff += filter_size * 8;
+        }
+    }
+
+#endif
+
+    for (int y = 0; y < height; y+=2) {
+        current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+        float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
+        float* AVS_RESTRICT dst2_ptr2 = dst + (y + 1) * dst_pitch;
+        const float* src_ptr = src + y * src_pitch;
+        const float* src_ptr2 = src + (y + 1) * src_pitch;
+
+        for (int x = 0; x < width; x += 8) {
+
+            __m256 result = _mm256_setzero_ps();
+            __m256 result2 = _mm256_setzero_ps();
+
+            for (int i = 0; i < kernel_size; i += 4) { // is it always mod4 ?
+                __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0] + i, src_ptr + program->pixel_offset[x + 4] + i);
+                __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1] + i, src_ptr + program->pixel_offset[x + 5] + i);
+                __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2] + i, src_ptr + program->pixel_offset[x + 6] + i);
+                __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3] + i, src_ptr + program->pixel_offset[x + 7] + i);
+
+                __m256 data_1_data_5_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 0] + i, src_ptr2 + program->pixel_offset[x + 4] + i);
+                __m256 data_2_data_6_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 1] + i, src_ptr2 + program->pixel_offset[x + 5] + i);
+                __m256 data_3_data_7_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 2] + i, src_ptr2 + program->pixel_offset[x + 6] + i);
+                __m256 data_4_data_8_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 3] + i, src_ptr2 + program->pixel_offset[x + 7] + i);
+
+                __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + i + filter_size * 0, current_coeff + i + filter_size * 4);
+                __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + i + filter_size * 1, current_coeff + i + filter_size * 5);
+                __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + i + filter_size * 2, current_coeff + i + filter_size * 6);
+                __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + i + filter_size * 3, current_coeff + i + filter_size * 7);
+
+                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
+                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5_2, data_2_data_6_2, data_3_data_7_2, data_4_data_8_2);
+                _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+
+                result = _mm256_fmadd_ps(data_1_data_5, coef_1_coef_5, result);
+                result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+                result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+                result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+
+                result2 = _mm256_fmadd_ps(data_1_data_5_2, coef_1_coef_5, result2);
+                result2 = _mm256_fmadd_ps(data_2_data_6_2, coef_2_coef_6, result2);
+                result2 = _mm256_fmadd_ps(data_3_data_7_2, coef_3_coef_7, result2);
+                result2 = _mm256_fmadd_ps(data_4_data_8_2, coef_4_coef_8, result2);
+
+            }
+
+            // need to process last non-mod4 kernel samples in scalar way. or can we do over-read up to 3 kernel and source samples safely with main 4-kernel_samples loop ?
+
+            _mm256_store_ps(dst2_ptr + x, result);
+            _mm256_store_ps(dst2_ptr2 + x, result2);
+            current_coeff += filter_size * 8;
+        }
+    }
+
+#if 0
+    for (int y = 0; y < height; y += 4) {
+        current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+        float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
+        float* AVS_RESTRICT dst2_ptr2 = dst + (y + 1) * dst_pitch;
+        float* AVS_RESTRICT dst2_ptr3 = dst + (y + 2) * dst_pitch;
+        float* AVS_RESTRICT dst2_ptr4 = dst + (y + 3) * dst_pitch;
+        const float* src_ptr = src + y * src_pitch;
+        const float* src_ptr2 = src + (y + 1) * src_pitch;
+        const float* src_ptr3 = src + (y + 2) * src_pitch;
+        const float* src_ptr4 = src + (y + 3) * src_pitch;
+
+        for (int x = 0; x < width - 8; x += 8) {
+
+            __m256 result = _mm256_setzero_ps();
+            __m256 result2 = _mm256_setzero_ps();
+            __m256 result3 = _mm256_setzero_ps();
+            __m256 result4 = _mm256_setzero_ps();
+
+            for (int i = 0; i < ksmod4; i += 4) {
+                // to do with 2x 4x4 loads into 4x256 bit registers and using  _MM_TRANSPOSE8_LANE4_PS
+
+                __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0] + i, src_ptr + program->pixel_offset[x + 4] + i);
+                __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1] + i, src_ptr + program->pixel_offset[x + 5] + i);
+                __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2] + i, src_ptr + program->pixel_offset[x + 6] + i);
+                __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3] + i, src_ptr + program->pixel_offset[x + 7] + i);
+
+                __m256 data_1_data_5_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 0] + i, src_ptr2 + program->pixel_offset[x + 4] + i);
+                __m256 data_2_data_6_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 1] + i, src_ptr2 + program->pixel_offset[x + 5] + i);
+                __m256 data_3_data_7_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 2] + i, src_ptr2 + program->pixel_offset[x + 6] + i);
+                __m256 data_4_data_8_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 3] + i, src_ptr2 + program->pixel_offset[x + 7] + i);
+
+                __m256 data_1_data_5_3 = _mm256_loadu_2_m128(src_ptr3 + program->pixel_offset[x + 0] + i, src_ptr3 + program->pixel_offset[x + 4] + i);
+                __m256 data_2_data_6_3 = _mm256_loadu_2_m128(src_ptr3 + program->pixel_offset[x + 1] + i, src_ptr3 + program->pixel_offset[x + 5] + i);
+                __m256 data_3_data_7_3 = _mm256_loadu_2_m128(src_ptr3 + program->pixel_offset[x + 2] + i, src_ptr3 + program->pixel_offset[x + 6] + i);
+                __m256 data_4_data_8_3 = _mm256_loadu_2_m128(src_ptr3 + program->pixel_offset[x + 3] + i, src_ptr3 + program->pixel_offset[x + 7] + i);
+
+                __m256 data_1_data_5_4 = _mm256_loadu_2_m128(src_ptr4 + program->pixel_offset[x + 0] + i, src_ptr4 + program->pixel_offset[x + 4] + i);
+                __m256 data_2_data_6_4 = _mm256_loadu_2_m128(src_ptr4 + program->pixel_offset[x + 1] + i, src_ptr4 + program->pixel_offset[x + 5] + i);
+                __m256 data_3_data_7_4 = _mm256_loadu_2_m128(src_ptr4 + program->pixel_offset[x + 2] + i, src_ptr4 + program->pixel_offset[x + 6] + i);
+                __m256 data_4_data_8_4 = _mm256_loadu_2_m128(src_ptr4 + program->pixel_offset[x + 3] + i, src_ptr4 + program->pixel_offset[x + 7] + i);
+
+                __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + i + filter_size * 0, current_coeff + i + filter_size * 4);
+                __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + i + filter_size * 1, current_coeff + i + filter_size * 5);
+                __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + i + filter_size * 2, current_coeff + i + filter_size * 6);
+                __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + i + filter_size * 3, current_coeff + i + filter_size * 7);
+
+
+                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
+                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5_2, data_2_data_6_2, data_3_data_7_2, data_4_data_8_2);
+                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5_3, data_2_data_6_3, data_3_data_7_3, data_4_data_8_3);
+                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5_4, data_2_data_6_4, data_3_data_7_4, data_4_data_8_4);
+
+                _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+
+                result = _mm256_fmadd_ps(data_1_data_5, coef_1_coef_5, result);
+                result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+                result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+                result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+
+                result2 = _mm256_fmadd_ps(data_1_data_5_2, coef_1_coef_5, result2);
+                result2 = _mm256_fmadd_ps(data_2_data_6_2, coef_2_coef_6, result2);
+                result2 = _mm256_fmadd_ps(data_3_data_7_2, coef_3_coef_7, result2);
+                result2 = _mm256_fmadd_ps(data_4_data_8_2, coef_4_coef_8, result2);
+
+                result3 = _mm256_fmadd_ps(data_1_data_5_3, coef_1_coef_5, result3);
+                result3 = _mm256_fmadd_ps(data_2_data_6_3, coef_2_coef_6, result3);
+                result3 = _mm256_fmadd_ps(data_3_data_7_3, coef_3_coef_7, result3);
+                result3 = _mm256_fmadd_ps(data_4_data_8_3, coef_4_coef_8, result3);
+
+                result4 = _mm256_fmadd_ps(data_1_data_5_4, coef_1_coef_5, result4);
+                result4 = _mm256_fmadd_ps(data_2_data_6_4, coef_2_coef_6, result4);
+                result4 = _mm256_fmadd_ps(data_3_data_7_4, coef_3_coef_7, result4);
+                result4 = _mm256_fmadd_ps(data_4_data_8_4, coef_4_coef_8, result4);
+
+            }
+
+            // need to process last non-mod4 kernel samples in scalar way. or can we do over-read up to 3 kernel and source samples safely with main 4-kernel_samples loop ?
+
+            _mm256_store_ps(dst2_ptr + x, result);
+            _mm256_store_ps(dst2_ptr2 + x, result2);
+            _mm256_store_ps(dst2_ptr3 + x, result3);
+            _mm256_store_ps(dst2_ptr4 + x, result4);
+
+            current_coeff += filter_size * 8;
+        }
+    }
+#endif
+}
\ No newline at end of file
diff --git a/avs_core/filters/intel/resample_avx2.h b/avs_core/filters/intel/resample_avx2.h
index a3b163aa3..3d88dc37d 100644
--- a/avs_core/filters/intel/resample_avx2.h
+++ b/avs_core/filters/intel/resample_avx2.h
@@ -52,4 +52,65 @@ void resize_v_avx2_planar_uint16_t(BYTE* dst0, const BYTE* src0, int dst_pitch,
 
 void resize_v_avx2_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 
+void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+// Transpose 4x4 blocks within each lane
+#define _MM_TRANSPOSE8_LANE4_PS(row0, row1, row2, row3) \
+	do { \
+		__m256 __t0, __t1, __t2, __t3; \
+		__t0 = _mm256_unpacklo_ps(row0, row1); \
+		__t1 = _mm256_unpackhi_ps(row0, row1); \
+		__t2 = _mm256_unpacklo_ps(row2, row3); \
+		__t3 = _mm256_unpackhi_ps(row2, row3); \
+		row0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
+		row1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
+		row2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
+		row3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
+	} while (0)
+
+#define _MM_TRANSPOSE8_PS(row0, row1, row2, row3, row4, row5, row6, row7) \
+	do { \
+		__m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; \
+		__m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; \
+		__t0 = _mm256_unpacklo_ps(row0, row1); \
+		__t1 = _mm256_unpackhi_ps(row0, row1); \
+		__t2 = _mm256_unpacklo_ps(row2, row3); \
+		__t3 = _mm256_unpackhi_ps(row2, row3); \
+		__t4 = _mm256_unpacklo_ps(row4, row5); \
+		__t5 = _mm256_unpackhi_ps(row4, row5); \
+		__t6 = _mm256_unpacklo_ps(row6, row7); \
+		__t7 = _mm256_unpackhi_ps(row6, row7); \
+		__tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
+		__tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
+		__tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
+		__tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
+		__tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); \
+		__tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); \
+		__tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); \
+		__tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); \
+		row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); \
+		row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); \
+		row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); \
+		row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); \
+		row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); \
+		row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); \
+		row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); \
+		row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); \
+	} while (0)
+
+
+
+#ifndef _mm256_loadu_2_m128
+#define _mm256_loadu_2_m128(/* __m128 const* */ loaddr, \
+                            /* __m128 const* */ hiaddr) \
+    _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
+#endif
+
+#ifndef _mm256_load_2_m128
+#define _mm256_load_2_m128(/* __m128 const* */ loaddr, \
+                            /* __m128 const* */ hiaddr) \
+    _mm256_set_m128(_mm_load_ps(hiaddr), _mm_load_ps(loaddr))
+#endif
+
+
 #endif // __Resample_AVX2_H__
diff --git a/avs_core/filters/intel/resample_sse.cpp b/avs_core/filters/intel/resample_sse.cpp
index 13e8924dc..83acefc5f 100644
--- a/avs_core/filters/intel/resample_sse.cpp
+++ b/avs_core/filters/intel/resample_sse.cpp
@@ -1033,7 +1033,8 @@ void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_p
 	const int kernel_size = program->filter_size_real;
 	const int ksmod4 = kernel_size / 4 * 4;
 	//	const int ksmod8 = kernel_size / 8 * 8;
-
+#if 0
+    // single row processing - slower
 	for (int y = 0; y < height; y++) {
 		current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
 
@@ -1045,12 +1046,12 @@ void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_p
 			__m128 result = _mm_setzero_ps();
 
 			for (int i = 0; i < ksmod4; i += 4) {
-				__m128 data_1 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + i + 0]);
-				__m128 data_2 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + i + 1]);
-				__m128 data_3 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + i + 2]);
-				__m128 data_4 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + i + 3]);
+				__m128 data_1 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 0] + i);
+				__m128 data_2 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 1] + i);
+				__m128 data_3 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 2] + i);
+				__m128 data_4 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 3] + i);
 
-				__m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0); // is it correct for i > 0 ? may be filter_size * (0 + i) ?
+				__m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0); 
 				__m128 coeff_2 = _mm_load_ps(current_coeff + i + filter_size * 1);
 				__m128 coeff_3 = _mm_load_ps(current_coeff + i + filter_size * 2);
 				__m128 coeff_4 = _mm_load_ps(current_coeff + i + filter_size * 3);
@@ -1058,22 +1059,72 @@ void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_p
 				_MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
 				_MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
 
-				__m128 temp_result = _mm_mul_ps(data_1, coeff_1);
-				result = _mm_add_ps(temp_result, result);
-
-				temp_result = _mm_mul_ps(data_2, coeff_2);
-				result = _mm_add_ps(temp_result, result);
-
-				temp_result = _mm_mul_ps(data_3, coeff_3);
-				result = _mm_add_ps(temp_result, result);
-
-				temp_result = _mm_mul_ps(data_4, coeff_4);
-				result = _mm_add_ps(temp_result, result);
-			}
+                result = _mm_add_ps(_mm_mul_ps(data_1, coeff_1), result);
+                result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
+                result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
+                result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
+            }
 
 			_mm_store_ps(dst2_ptr + x, result);
 			current_coeff += filter_size * 4;
 		}
 	}
+#endif
+
+    for (int y = 0; y < height; y+=2) {
+        current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+        float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
+        float* AVS_RESTRICT dst2_ptr2 = dst + (y + 1) * dst_pitch;
+        const float* src_ptr = src + y * src_pitch;
+        const float* src_ptr2 = src + (y + 1) * src_pitch;
+
+        for (int x = 0; x < width; x += 4) {
+
+            __m128 result = _mm_setzero_ps();
+            __m128 result2 = _mm_setzero_ps();
+
+            for (int i = 0; i < kernel_size; i += 4) { // it is always mod4 ?
+
+                __m128 data_1 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 0] + i);
+                __m128 data_2 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 1] + i);
+                __m128 data_3 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 2] + i);
+                __m128 data_4 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 3] + i);
+
+                __m128 data_1_2 = _mm_loadu_ps(src_ptr2 + program->pixel_offset[x + 0] + i);
+                __m128 data_2_2 = _mm_loadu_ps(src_ptr2 + program->pixel_offset[x + 1] + i);
+                __m128 data_3_2 = _mm_loadu_ps(src_ptr2 + program->pixel_offset[x + 2] + i);
+                __m128 data_4_2 = _mm_loadu_ps(src_ptr2 + program->pixel_offset[x + 3] + i);
+
+                __m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0);
+                __m128 coeff_2 = _mm_load_ps(current_coeff + i + filter_size * 1);
+                __m128 coeff_3 = _mm_load_ps(current_coeff + i + filter_size * 2);
+                __m128 coeff_4 = _mm_load_ps(current_coeff + i + filter_size * 3);
+
+                _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
+                _MM_TRANSPOSE4_PS(data_1_2, data_2_2, data_3_2, data_4_2);
+                _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
+
+                result = _mm_add_ps(_mm_mul_ps(data_1, coeff_1), result);
+                result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
+                result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
+                result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
+
+                result2 = _mm_add_ps(_mm_mul_ps(data_1_2, coeff_1), result2);
+                result2 = _mm_add_ps(_mm_mul_ps(data_2_2, coeff_2), result2);
+                result2 = _mm_add_ps(_mm_mul_ps(data_3_2, coeff_3), result2);
+                result2 = _mm_add_ps(_mm_mul_ps(data_4_2, coeff_4), result2);
+
+            }
+
+            _mm_store_ps(dst2_ptr + x, result);
+            _mm_store_ps(dst2_ptr2 + x, result2);
+
+            current_coeff += filter_size * 4;
+        }
+    }
+
+    // to do - need to process last row of not-mod2 heights
+
 }
 

From 7879878b5548348de069bcb5c10a846cbb52df0a Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sun, 4 May 2025 07:24:09 -0700
Subject: [PATCH 04/27] SSE and AVX transpose-based resizers for float32

Selected when kernel_size is up to 4.
---
 avs_core/filters/resample.cpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index 1f52fb21d..4df4b508a 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -1585,11 +1585,20 @@ ResamplerH FilteredResizeH::GetResampler(int CPU, int pixelsize, int bits_per_pi
   else { //if (pixelsize == 4)
 #ifdef INTEL_INTRINSICS
     if (CPU & CPUF_AVX2) {
-      return resizer_h_avx2_generic_float;
+//      return resizer_h_avx2_generic_float;
+        if(program->filter_size_real <=4)
+        { 
+            return resize_h_planar_float_avx_transpose_vstripe_ks4;
+        }
+        return resizer_h_avx2_generic_float;
     }
     if (CPU & CPUF_SSSE3) {
-      return resizer_h_ssse3_generic_float;
-//		return resize_h_planar_float_sse_transpose;
+//      return resizer_h_ssse3_generic_float;
+        if (program->filter_size_real <= 4)
+        {
+            return resize_h_planar_float_sse_transpose_vstripe_ks4;
+        }
+		return resizer_h_ssse3_generic_float;
     }
 #endif
     return resize_h_c_planar<float, 0>;

From 7a9dc2d65eb45de72a48590fe47995a7421f47d7 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sun, 4 May 2025 07:25:39 -0700
Subject: [PATCH 05/27] SSE and AVX transpose-based resizers

For kernel_size up to 4.
---
 avs_core/filters/intel/resample_avx2.cpp | 115 ++++++++---------------
 avs_core/filters/intel/resample_avx2.h   |   2 +
 avs_core/filters/intel/resample_sse.cpp  |  50 ++++++++++
 avs_core/filters/intel/resample_sse.h    |   1 +
 4 files changed, 90 insertions(+), 78 deletions(-)

diff --git a/avs_core/filters/intel/resample_avx2.cpp b/avs_core/filters/intel/resample_avx2.cpp
index ea4837fca..93ce72047 100644
--- a/avs_core/filters/intel/resample_avx2.cpp
+++ b/avs_core/filters/intel/resample_avx2.cpp
@@ -1015,8 +1015,8 @@ void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_p
     const int kernel_size = program->filter_size_real;
     const int ksmod4 = kernel_size / 4 * 4;
 //    const int ksmod8 = kernel_size / 8 * 8;
-
 #if 0
+
     for (int y = 0; y < height; y ++) {
         current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
 
@@ -1039,7 +1039,6 @@ void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_p
                 __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + i + filter_size * 2, current_coeff + i + filter_size * 6);
                 __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + i + filter_size * 3, current_coeff + i + filter_size * 7);
 
-
                 _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
                 _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
 
@@ -1054,7 +1053,6 @@ void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_p
             current_coeff += filter_size * 8;
         }
     }
-
 #endif
 
     for (int y = 0; y < height; y+=2) {
@@ -1110,93 +1108,54 @@ void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_p
         }
     }
 
-#if 0
-    for (int y = 0; y < height; y += 4) {
-        current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
-
-        float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
-        float* AVS_RESTRICT dst2_ptr2 = dst + (y + 1) * dst_pitch;
-        float* AVS_RESTRICT dst2_ptr3 = dst + (y + 2) * dst_pitch;
-        float* AVS_RESTRICT dst2_ptr4 = dst + (y + 3) * dst_pitch;
-        const float* src_ptr = src + y * src_pitch;
-        const float* src_ptr2 = src + (y + 1) * src_pitch;
-        const float* src_ptr3 = src + (y + 2) * src_pitch;
-        const float* src_ptr4 = src + (y + 3) * src_pitch;
-
-        for (int x = 0; x < width - 8; x += 8) {
-
-            __m256 result = _mm256_setzero_ps();
-            __m256 result2 = _mm256_setzero_ps();
-            __m256 result3 = _mm256_setzero_ps();
-            __m256 result4 = _mm256_setzero_ps();
-
-            for (int i = 0; i < ksmod4; i += 4) {
-                // to do with 2x 4x4 loads into 4x256 bit registers and using  _MM_TRANSPOSE8_LANE4_PS
-
-                __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0] + i, src_ptr + program->pixel_offset[x + 4] + i);
-                __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1] + i, src_ptr + program->pixel_offset[x + 5] + i);
-                __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2] + i, src_ptr + program->pixel_offset[x + 6] + i);
-                __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3] + i, src_ptr + program->pixel_offset[x + 7] + i);
-
-                __m256 data_1_data_5_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 0] + i, src_ptr2 + program->pixel_offset[x + 4] + i);
-                __m256 data_2_data_6_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 1] + i, src_ptr2 + program->pixel_offset[x + 5] + i);
-                __m256 data_3_data_7_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 2] + i, src_ptr2 + program->pixel_offset[x + 6] + i);
-                __m256 data_4_data_8_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 3] + i, src_ptr2 + program->pixel_offset[x + 7] + i);
-
-                __m256 data_1_data_5_3 = _mm256_loadu_2_m128(src_ptr3 + program->pixel_offset[x + 0] + i, src_ptr3 + program->pixel_offset[x + 4] + i);
-                __m256 data_2_data_6_3 = _mm256_loadu_2_m128(src_ptr3 + program->pixel_offset[x + 1] + i, src_ptr3 + program->pixel_offset[x + 5] + i);
-                __m256 data_3_data_7_3 = _mm256_loadu_2_m128(src_ptr3 + program->pixel_offset[x + 2] + i, src_ptr3 + program->pixel_offset[x + 6] + i);
-                __m256 data_4_data_8_3 = _mm256_loadu_2_m128(src_ptr3 + program->pixel_offset[x + 3] + i, src_ptr3 + program->pixel_offset[x + 7] + i);
-
-                __m256 data_1_data_5_4 = _mm256_loadu_2_m128(src_ptr4 + program->pixel_offset[x + 0] + i, src_ptr4 + program->pixel_offset[x + 4] + i);
-                __m256 data_2_data_6_4 = _mm256_loadu_2_m128(src_ptr4 + program->pixel_offset[x + 1] + i, src_ptr4 + program->pixel_offset[x + 5] + i);
-                __m256 data_3_data_7_4 = _mm256_loadu_2_m128(src_ptr4 + program->pixel_offset[x + 2] + i, src_ptr4 + program->pixel_offset[x + 6] + i);
-                __m256 data_4_data_8_4 = _mm256_loadu_2_m128(src_ptr4 + program->pixel_offset[x + 3] + i, src_ptr4 + program->pixel_offset[x + 7] + i);
+}
 
-                __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + i + filter_size * 0, current_coeff + i + filter_size * 4);
-                __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + i + filter_size * 1, current_coeff + i + filter_size * 5);
-                __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + i + filter_size * 2, current_coeff + i + filter_size * 6);
-                __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + i + filter_size * 3, current_coeff + i + filter_size * 7);
+// process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2
+void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+    int filter_size = program->filter_size;
 
+    const float* AVS_RESTRICT current_coeff;
 
-                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
-                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5_2, data_2_data_6_2, data_3_data_7_2, data_4_data_8_2);
-                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5_3, data_2_data_6_3, data_3_data_7_3, data_4_data_8_3);
-                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5_4, data_2_data_6_4, data_3_data_7_4, data_4_data_8_4);
+    src_pitch = src_pitch / sizeof(float);
+    dst_pitch = dst_pitch / sizeof(float);
 
-                _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+    float* src = (float*)src8;
+    float* dst = (float*)dst8;
 
-                result = _mm256_fmadd_ps(data_1_data_5, coef_1_coef_5, result);
-                result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
-                result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
-                result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
 
-                result2 = _mm256_fmadd_ps(data_1_data_5_2, coef_1_coef_5, result2);
-                result2 = _mm256_fmadd_ps(data_2_data_6_2, coef_2_coef_6, result2);
-                result2 = _mm256_fmadd_ps(data_3_data_7_2, coef_3_coef_7, result2);
-                result2 = _mm256_fmadd_ps(data_4_data_8_2, coef_4_coef_8, result2);
+    for (int x = 0; x < width; x += 8) 
+    {
+        __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+        __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+        __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+        __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+        
+        _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
 
-                result3 = _mm256_fmadd_ps(data_1_data_5_3, coef_1_coef_5, result3);
-                result3 = _mm256_fmadd_ps(data_2_data_6_3, coef_2_coef_6, result3);
-                result3 = _mm256_fmadd_ps(data_3_data_7_3, coef_3_coef_7, result3);
-                result3 = _mm256_fmadd_ps(data_4_data_8_3, coef_4_coef_8, result3);
+        float* AVS_RESTRICT dst_ptr = dst + x;
+        const float* src_ptr = src;
 
-                result4 = _mm256_fmadd_ps(data_1_data_5_4, coef_1_coef_5, result4);
-                result4 = _mm256_fmadd_ps(data_2_data_6_4, coef_2_coef_6, result4);
-                result4 = _mm256_fmadd_ps(data_3_data_7_4, coef_3_coef_7, result4);
-                result4 = _mm256_fmadd_ps(data_4_data_8_4, coef_4_coef_8, result4);
+        for (int y = 0; y < height; y++) 
+        {
+            __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0], src_ptr + program->pixel_offset[x + 4]);
+            __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1], src_ptr + program->pixel_offset[x + 5]);
+            __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2], src_ptr + program->pixel_offset[x + 6]);
+            __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3], src_ptr + program->pixel_offset[x + 7]);
 
-            }
+            _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
 
-            // need to process last non-mod4 kernel samples in scalar way. or can we do over-read up to 3 kernel and source samples safely with main 4-kernel_samples loop ?
+            __m256 result = _mm256_mul_ps(data_1_data_5, coef_1_coef_5);
+            result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+            result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+            result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
 
-            _mm256_store_ps(dst2_ptr + x, result);
-            _mm256_store_ps(dst2_ptr2 + x, result2);
-            _mm256_store_ps(dst2_ptr3 + x, result3);
-            _mm256_store_ps(dst2_ptr4 + x, result4);
+            _mm256_store_ps(dst_ptr, result);
 
-            current_coeff += filter_size * 8;
+            dst_ptr += dst_pitch;
+            src_ptr += src_pitch;
         }
+        current_coeff += filter_size * 8;
     }
-#endif
+
 }
\ No newline at end of file
diff --git a/avs_core/filters/intel/resample_avx2.h b/avs_core/filters/intel/resample_avx2.h
index 3d88dc37d..cef2a5125 100644
--- a/avs_core/filters/intel/resample_avx2.h
+++ b/avs_core/filters/intel/resample_avx2.h
@@ -54,6 +54,8 @@ void resize_v_avx2_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int
 
 void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
+void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
 // Transpose 4x4 blocks within each lane
 #define _MM_TRANSPOSE8_LANE4_PS(row0, row1, row2, row3) \
 	do { \
diff --git a/avs_core/filters/intel/resample_sse.cpp b/avs_core/filters/intel/resample_sse.cpp
index 83acefc5f..a3729c295 100644
--- a/avs_core/filters/intel/resample_sse.cpp
+++ b/avs_core/filters/intel/resample_sse.cpp
@@ -1125,6 +1125,56 @@ void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_p
     }
 
     // to do - need to process last row of not-mod2 heights
+}
+
+// process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2
+void resize_h_planar_float_sse_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) 
+{
+    int filter_size = program->filter_size;
+
+    const float* AVS_RESTRICT current_coeff;
+
+    src_pitch = src_pitch / sizeof(float);
+    dst_pitch = dst_pitch / sizeof(float);
+
+    float* src = (float*)src8;
+    float* dst = (float*)dst8;
+
+    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+    for (int x = 0; x < width; x += 4)
+    {
+        __m128 coeff_1 = _mm_load_ps(current_coeff + filter_size * 0);
+        __m128 coeff_2 = _mm_load_ps(current_coeff + filter_size * 1);
+        __m128 coeff_3 = _mm_load_ps(current_coeff + filter_size * 2);
+        __m128 coeff_4 = _mm_load_ps(current_coeff + filter_size * 3);
+
+        _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
+
+        float* AVS_RESTRICT dst_ptr = dst + x;
+        const float* src_ptr = src;
+
+        for (int y = 0; y < height; y++)
+        {
+            __m128 data_1 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 0]);
+            __m128 data_2 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 1]);
+            __m128 data_3 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 2]);
+            __m128 data_4 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 3]);
+
+            _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
+
+            __m128 result = _mm_mul_ps(data_1, coeff_1);
+            result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
+            result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
+            result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
+
+            _mm_store_ps(dst_ptr, result);
+
+            dst_ptr += dst_pitch;
+            src_ptr += src_pitch;
+        }
+        current_coeff += filter_size * 4;
+    }
 
 }
 
diff --git a/avs_core/filters/intel/resample_sse.h b/avs_core/filters/intel/resample_sse.h
index e1e20488a..e4febc523 100644
--- a/avs_core/filters/intel/resample_sse.h
+++ b/avs_core/filters/intel/resample_sse.h
@@ -61,5 +61,6 @@ __attribute__((__target__("ssse3")))
 void resizer_h_ssse3_generic_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
 void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+void resize_h_planar_float_sse_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
 #endif // __Resample_SSE_H__

From 4fdda09903080b3b46165ab967968e768c486e09 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Fri, 9 May 2025 08:52:02 -0700
Subject: [PATCH 06/27] Added AVX512 for

V-resize and H-resize for kernel_size <=4
---
 avs_core/filters/resample.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index 4df4b508a..ec0916834 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -36,6 +36,7 @@
 #ifdef INTEL_INTRINSICS
 #include "intel/resample_sse.h"
 #include "intel/resample_avx2.h"
+#include "intel/resample_avx512.h"
 #include "intel/turn_sse.h"
 #endif
 #include <avs/config.h>
@@ -1584,6 +1585,13 @@ ResamplerH FilteredResizeH::GetResampler(int CPU, int pixelsize, int bits_per_pi
   }
   else { //if (pixelsize == 4)
 #ifdef INTEL_INTRINSICS
+    if (CPU & CPUF_AVX512F) {
+      if (program->filter_size_real <= 4)
+      {
+        return resize_h_planar_float_avx512_transpose_vstripe_ks4;
+      }
+    }
+
     if (CPU & CPUF_AVX2) {
 //      return resizer_h_avx2_generic_float;
         if(program->filter_size_real <=4)
@@ -1798,6 +1806,9 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi
     else // pixelsize== 4
     {
 #ifdef INTEL_INTRINSICS
+      if (CPU & CPUF_AVX512F) {
+        return resize_v_avx512_planar_float;
+      }
       if (CPU & CPUF_AVX2) {
         return resize_v_avx2_planar_float;
       }

From 5bd7a28bb2456c9160af8fdb3a0cdbba320a5c97 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Fri, 9 May 2025 08:53:19 -0700
Subject: [PATCH 07/27] Added AVX512 for

V-resize and H-resize for kernel_size <=4
---
 avs_core/filters/intel/resample_avx512.cpp | 183 +++++++++++++++++++++
 avs_core/filters/intel/resample_avx512.h   |  75 +++++++++
 2 files changed, 258 insertions(+)
 create mode 100644 avs_core/filters/intel/resample_avx512.cpp
 create mode 100644 avs_core/filters/intel/resample_avx512.h

diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp
new file mode 100644
index 000000000..7a0137c15
--- /dev/null
+++ b/avs_core/filters/intel/resample_avx512.cpp
@@ -0,0 +1,183 @@
+// Avisynth v2.5.  Copyright 2002 Ben Rudiak-Gould et al.
+// http://avisynth.nl
+
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
+// http://www.gnu.org/copyleft/gpl.html .
+//
+// Linking Avisynth statically or dynamically with other modules is making a
+// combined work based on Avisynth.  Thus, the terms and conditions of the GNU
+// General Public License cover the whole combination.
+//
+// As a special exception, the copyright holders of Avisynth give you
+// permission to link Avisynth with independent modules that communicate with
+// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license
+// terms of these independent modules, and to copy and distribute the
+// resulting combined work under terms of your choice, provided that
+// every copy of the combined work is accompanied by a complete copy of
+// the source code of Avisynth (the version of Avisynth used to produce the
+// combined work), being distributed under the terms of the GNU General
+// Public License plus this exception.  An independent module is a module
+// which is not derived from or based on Avisynth, such as 3rd-party filters,
+// import and export plugins, or graphical user interfaces.
+
+//#include "resample_sse.h"
+#include <avs/config.h>
+#include "../core/internal.h"
+
+#include <avs/alignment.h>
+#include <avs/minmax.h>
+
+// experimental simd includes for avx2 compiled files
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
+#include <x86intrin.h>
+// x86intrin.h includes header files for whatever instruction
+// sets are specified on the compiler command line, such as: xopintrin.h, fma4intrin.h
+#else
+#include <immintrin.h> // MS version of immintrin.h covers AVX, AVX2 and FMA3
+#endif // __GNUC__
+
+#if !defined(__FMA__)
+// Assume that all processors that have AVX2 also have FMA3
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (__clang__)
+// Prevent error message in g++ when using FMA intrinsics with avx2:
+#pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
+#else
+#define __FMA__  1
+#endif
+#endif
+// FMA3 instruction set
+#if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__))  && ! defined (__INTEL_COMPILER)
+#include <fmaintrin.h>
+#endif // __FMA__
+
+
+#include "resample_avx512.h"
+
+//------- 512 bit float Horizontals
+
+// Transpose-based
+// process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2
+void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+    int filter_size = program->filter_size;
+
+    const float* AVS_RESTRICT current_coeff;
+
+    src_pitch = src_pitch / sizeof(float);
+    dst_pitch = dst_pitch / sizeof(float);
+
+    float* src = (float*)src8;
+    float* dst = (float*)dst8;
+
+    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+    for (int x = 0; x < width; x += 16) // is it safe to read by 16 floats = 64 bytes ?
+    {
+        __m512 c1_c5_c9_c13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+        __m512 c2_c6_c10_c14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+        __m512 c3_c7_c11_c15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+        __m512 c4_c8_c12_c16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+        _MM_TRANSPOSE16_LANE4_PS(c1_c5_c9_c13, c2_c6_c10_c14, c3_c7_c11_c15, c4_c8_c12_c16);
+
+        float* AVS_RESTRICT dst_ptr = dst + x;
+        const float* src_ptr = src;
+
+        for (int y = 0; y < height; y++)
+        {
+            __m512 d1_d5_d9_d13 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 0], src_ptr + program->pixel_offset[x + 4], src_ptr + program->pixel_offset[x + 8], src_ptr + program->pixel_offset[x + 12]);
+            __m512 d2_d6_d10_d14 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 1], src_ptr + program->pixel_offset[x + 5], src_ptr + program->pixel_offset[x + 9], src_ptr + program->pixel_offset[x + 13]);
+            __m512 d3_d7_d11_d15 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 2], src_ptr + program->pixel_offset[x + 6], src_ptr + program->pixel_offset[x + 10], src_ptr + program->pixel_offset[x + 14]);
+            __m512 d4_d8_d12_d16 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 3], src_ptr + program->pixel_offset[x + 7], src_ptr + program->pixel_offset[x + 11], src_ptr + program->pixel_offset[x + 15]);
+
+            _MM_TRANSPOSE16_LANE4_PS(d1_d5_d9_d13, d2_d6_d10_d14, d3_d7_d11_d15, d4_d8_d12_d16);
+
+            __m512 result = _mm512_mul_ps(d1_d5_d9_d13, c1_c5_c9_c13);
+            result = _mm512_fmadd_ps(d2_d6_d10_d14, c2_c6_c10_c14, result);
+            result = _mm512_fmadd_ps(d3_d7_d11_d15, c3_c7_c11_c15, result);
+            result = _mm512_fmadd_ps(d4_d8_d12_d16, c4_c8_c12_c16, result);
+
+            _mm512_store_ps(dst_ptr, result);
+
+            dst_ptr += dst_pitch;
+            src_ptr += src_pitch;
+        }
+        current_coeff += filter_size * 16;
+    }
+
+}
+
+//-------- 512 bit float Verticals
+
+void resize_v_avx512_planar_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
+{
+  AVS_UNUSED(bits_per_pixel);
+
+  const int filter_size = program->filter_size;
+  const float* AVS_RESTRICT current_coeff = program->pixel_coefficient_float;
+
+  const float* src = (const float*)src8;
+  float* AVS_RESTRICT dst = (float*)dst8;
+  dst_pitch = dst_pitch / sizeof(float);
+  src_pitch = src_pitch / sizeof(float);
+
+  const int kernel_size = program->filter_size_real; // not the aligned
+  const int kernel_size_mod2 = (kernel_size / 2) * 2; // Process pairs of rows for better efficiency
+  const bool notMod2 = kernel_size_mod2 < kernel_size;
+
+  for (int y = 0; y < target_height; y++) {
+    int offset = program->pixel_offset[y];
+    const float* src_ptr = src + offset * src_pitch;
+
+    // 64 byte 16 floats (AVX512 register holds 16 floats)
+    // no need for wmod8, alignment is safe 32 bytes at least - is it safe for 64 bytes ?
+    for (int x = 0; x < width; x += 16) {
+      __m512 result_single = _mm512_setzero_ps();
+      __m512 result_single_2 = _mm512_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      // Process pairs of rows for better efficiency (2 coeffs/cycle)
+      // two result variables for potential parallel operation
+      int i = 0;
+      for (; i < kernel_size_mod2; i += 2) {
+        __m512 coeff_even = _mm512_set1_ps(current_coeff[i]);
+        __m512 coeff_odd = _mm512_set1_ps(current_coeff[i + 1]);
+
+        __m512 src_even = _mm512_loadu_ps(src2_ptr);
+        __m512 src_odd = _mm512_loadu_ps(src2_ptr + src_pitch);
+
+        result_single = _mm512_fmadd_ps(src_even, coeff_even, result_single);
+        result_single_2 = _mm512_fmadd_ps(src_odd, coeff_odd, result_single_2);
+
+        src2_ptr += 2 * src_pitch;
+      }
+
+      result_single = _mm512_add_ps(result_single, result_single_2);
+
+      // Process the last odd row if needed
+      if (notMod2) {
+        __m512 coeff = _mm512_set1_ps(current_coeff[i]);
+        __m512 src_val = _mm512_loadu_ps(src2_ptr);
+        result_single = _mm512_fmadd_ps(src_val, coeff, result_single);
+      }
+
+      _mm512_store_ps(dst + x, result_single);
+    }
+
+    dst += dst_pitch;
+    current_coeff += filter_size;
+  }
+}
+
diff --git a/avs_core/filters/intel/resample_avx512.h b/avs_core/filters/intel/resample_avx512.h
new file mode 100644
index 000000000..5dd73276b
--- /dev/null
+++ b/avs_core/filters/intel/resample_avx512.h
@@ -0,0 +1,75 @@
+// Avisynth v2.5.  Copyright 2002 Ben Rudiak-Gould et al.
+// http://avisynth.nl
+
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
+// http://www.gnu.org/copyleft/gpl.html .
+//
+// Linking Avisynth statically or dynamically with other modules is making a
+// combined work based on Avisynth.  Thus, the terms and conditions of the GNU
+// General Public License cover the whole combination.
+//
+// As a special exception, the copyright holders of Avisynth give you
+// permission to link Avisynth with independent modules that communicate with
+// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license
+// terms of these independent modules, and to copy and distribute the
+// resulting combined work under terms of your choice, provided that
+// every copy of the combined work is accompanied by a complete copy of
+// the source code of Avisynth (the version of Avisynth used to produce the
+// combined work), being distributed under the terms of the GNU General
+// Public License plus this exception.  An independent module is a module
+// which is not derived from or based on Avisynth, such as 3rd-party filters,
+// import and export plugins, or graphical user interfaces.
+
+#ifndef __Resample_AVX512_H__
+#define __Resample_AVX512_H__
+
+#include <avisynth.h>
+#include "../resample_functions.h"
+
+void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+void resize_v_avx512_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
+
+#define _MM_TRANSPOSE16_LANE4_PS(row0, row1, row2, row3) \
+	do { \
+		__m512 __t0, __t1, __t2, __t3; \
+		__t0 = _mm512_unpacklo_ps(row0, row1); \
+		__t1 = _mm512_unpackhi_ps(row0, row1); \
+		__t2 = _mm512_unpacklo_ps(row2, row3); \
+		__t3 = _mm512_unpackhi_ps(row2, row3); \
+		row0 = _mm512_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
+		row1 = _mm512_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
+		row2 = _mm512_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
+		row3 = _mm512_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
+	} while (0)
+
+#ifndef _mm512_loadu_4_m128
+#define _mm512_loadu_4_m128(/* __m128 const* */ addr1, \
+                            /* __m128 const* */ addr2, \
+                            /* __m128 const* */ addr3, \
+                            /* __m128 const* */ addr4) \
+_mm512_insertf32x4(_mm512_insertf32x4(_mm512_insertf32x4(_mm512_castps128_ps512(_mm_loadu_ps(addr1)), _mm_loadu_ps(addr2), 1), _mm_loadu_ps(addr3), 2), _mm_loadu_ps(addr4), 3)
+#endif
+
+#ifndef _mm512_load_4_m128
+#define _mm512_load_4_m128(/* __m128 const* */ addr1, \
+                            /* __m128 const* */ addr2, \
+                            /* __m128 const* */ addr3, \
+                            /* __m128 const* */ addr4) \
+_mm512_insertf32x4(_mm512_insertf32x4(_mm512_insertf32x4(_mm512_castps128_ps512(_mm_load_ps(addr1)), _mm_load_ps(addr2), 1), _mm_load_ps(addr3), 2), _mm_load_ps(addr4), 3)
+#endif
+
+#endif // __Resample_AVX512_H__

From 4b157138b38af38fb0e6dad148aa344507e8de24 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sat, 10 May 2025 04:06:23 -0700
Subject: [PATCH 08/27] Added permutex-based

float H-resize for AVX2 ks4, ks8. AVX512 ks4, ks8, ks16 (selectors here - some AVX512 functions are not fully debugged yet)
---
 avs_core/filters/resample.cpp | 36 +++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index ec0916834..8ff15bb66 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -1586,26 +1586,38 @@ ResamplerH FilteredResizeH::GetResampler(int CPU, int pixelsize, int bits_per_pi
   else { //if (pixelsize == 4)
 #ifdef INTEL_INTRINSICS
     if (CPU & CPUF_AVX512F) {
+      if ((program->filter_size_real <= 16) && (program->filter_size_real > 8))
+      {
+        return resize_h_planar_float_avx512_permutex_vstripe_ks16;
+      }
+      if ((program->filter_size_real <= 8) && (program->filter_size_real > 4))
+      {
+        return resize_h_planar_float_avx512_permutex_vstripe_ks8;
+      }
       if (program->filter_size_real <= 4)
       {
-        return resize_h_planar_float_avx512_transpose_vstripe_ks4;
+
+        return resize_h_planar_float_avx512_permutex_vstripe_ks4;
       }
     }
 
     if (CPU & CPUF_AVX2) {
-//      return resizer_h_avx2_generic_float;
-        if(program->filter_size_real <=4)
-        { 
-            return resize_h_planar_float_avx_transpose_vstripe_ks4;
-        }
-        return resizer_h_avx2_generic_float;
+      if ((program->filter_size_real <= 8) && (program->filter_size_real > 4))
+      {
+        return resize_h_planar_float_avx2_permutex_vstripe_ks8;
+      }
+
+      if(program->filter_size_real <=4)
+      { 
+        return resize_h_planar_float_avx2_permutex_vstripe_ks4;
+      }
+      return resizer_h_avx2_generic_float;
     }
     if (CPU & CPUF_SSSE3) {
-//      return resizer_h_ssse3_generic_float;
-        if (program->filter_size_real <= 4)
-        {
-            return resize_h_planar_float_sse_transpose_vstripe_ks4;
-        }
+      if (program->filter_size_real <= 4)
+      {
+          return resize_h_planar_float_sse_transpose_vstripe_ks4;
+      }
 		return resizer_h_ssse3_generic_float;
     }
 #endif

From 5aa2d5eefb71eafb160b93b91cc1e37b62f77af5 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sat, 10 May 2025 04:08:54 -0700
Subject: [PATCH 09/27] Added permutex-based AVX2/AVX512

float H-resizers:
AVX2 ks4 and ks8
AVX512 ks4, ks8, ks16 (ks8 and ks16 are for performance testing only - not fully debugged yet)
---
 avs_core/filters/intel/resample_avx2.cpp   | 262 ++++++++++-
 avs_core/filters/intel/resample_avx2.h     |   5 +
 avs_core/filters/intel/resample_avx512.cpp | 482 +++++++++++++++++++++
 avs_core/filters/intel/resample_avx512.h   |   3 +
 4 files changed, 751 insertions(+), 1 deletion(-)

diff --git a/avs_core/filters/intel/resample_avx2.cpp b/avs_core/filters/intel/resample_avx2.cpp
index 93ce72047..6dd0a1832 100644
--- a/avs_core/filters/intel/resample_avx2.cpp
+++ b/avs_core/filters/intel/resample_avx2.cpp
@@ -1158,4 +1158,264 @@ void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src
         current_coeff += filter_size * 8;
     }
 
-}
\ No newline at end of file
+}
+
+void resize_h_planar_float_avx_gather_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{ // Horizontal float resize applying 8 taps: handles 8 output columns per step and walks each 8-wide stripe over all rows.
+    int filter_size = program->filter_size;
+    // filter_size is used below as the distance (in floats) between the coefficient rows of consecutive output pixels.
+    const float* AVS_RESTRICT current_coeff;
+    // Pitches arrive in bytes; convert them to float elements for pointer arithmetic.
+    src_pitch = src_pitch / sizeof(float);
+    dst_pitch = dst_pitch / sizeof(float);
+
+    float* src = (float*)src8;
+    float* dst = (float*)dst8;
+
+    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+    __m256i one_epi32 = _mm256_set1_epi32(1);
+
+    for (int x = 0; x < width; x += 8)
+    {
+        __m256 coef_0 = _mm256_load_ps(current_coeff + filter_size * 0);
+        __m256 coef_1 = _mm256_load_ps(current_coeff + filter_size * 1);
+        __m256 coef_2 = _mm256_load_ps(current_coeff + filter_size * 2);
+        __m256 coef_3 = _mm256_load_ps(current_coeff + filter_size * 3);
+        __m256 coef_4 = _mm256_load_ps(current_coeff + filter_size * 4);
+        __m256 coef_5 = _mm256_load_ps(current_coeff + filter_size * 5);
+        __m256 coef_6 = _mm256_load_ps(current_coeff + filter_size * 6);
+        __m256 coef_7 = _mm256_load_ps(current_coeff + filter_size * 7);
+        // The aligned loads above read 8 floats per row. The macro below presumably transposes 8x8 so coef_k = tap k of all 8 pixels - TODO confirm.
+        _MM_TRANSPOSE8_PS(coef_0, coef_1, coef_2, coef_3, coef_4, coef_5, coef_6, coef_7);
+
+        float* AVS_RESTRICT dst_ptr = dst + x;
+        const float* src_ptr = src;
+        // src_ptr stays at the row start: the gather indices are the unmodified pixel_offset values.
+        for (int y = 0; y < height; y++)
+        {
+//            __m256i offsets = _mm256_load_si256(program->pixel_offset + x); // hope it is always aligned ?
+            __m256i offsets = _mm256_set_epi32(program->pixel_offset[x + 7], program->pixel_offset[x + 6], program->pixel_offset[x + 5], program->pixel_offset[x + 4], program->pixel_offset[x + 3], program->pixel_offset[x + 2], program->pixel_offset[x + 1], program->pixel_offset[x + 0]);
+//            __m256i offsets = _mm256_set1_epi32(program->pixel_offset[x]); // test
+            __m256 data_0 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+            // Advance every lane by one source pixel to fetch the next tap (repeated for taps 1..7).
+            offsets = _mm256_add_epi32(offsets, one_epi32);
+            __m256 data_1 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+            offsets = _mm256_add_epi32(offsets, one_epi32);
+            __m256 data_2 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+            offsets = _mm256_add_epi32(offsets, one_epi32);
+            __m256 data_3 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+            offsets = _mm256_add_epi32(offsets, one_epi32);
+            __m256 data_4 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+            offsets = _mm256_add_epi32(offsets, one_epi32);
+            __m256 data_5 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+            offsets = _mm256_add_epi32(offsets, one_epi32);
+            __m256 data_6 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+            offsets = _mm256_add_epi32(offsets, one_epi32);
+            __m256 data_7 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+            // Two independent accumulators (taps 0-3 and taps 4-7) are summed just before the store.
+            __m256 result0 = _mm256_mul_ps(data_0, coef_0);
+            __m256 result1 = _mm256_mul_ps(data_4, coef_4);
+            
+            result0 = _mm256_fmadd_ps(data_1, coef_1, result0);
+            result1 = _mm256_fmadd_ps(data_5, coef_5, result1);
+
+            result0 = _mm256_fmadd_ps(data_2, coef_2, result0);
+            result1 = _mm256_fmadd_ps(data_6, coef_6, result1);
+
+            result0 = _mm256_fmadd_ps(data_3, coef_3, result0);
+            result1 = _mm256_fmadd_ps(data_7, coef_7, result1);
+
+            _mm256_store_ps(dst_ptr, _mm256_add_ps(result0, result1)); // aligned 8-float store
+
+            dst_ptr += dst_pitch;
+            src_ptr += src_pitch;
+        }
+        current_coeff += filter_size * 8; // step to the coefficient rows of the next 8 output pixels
+    }
+}
+
+void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  // Horizontal float resize applying 4 taps. Each step produces 8 output pixels per row from ONE unaligned
+  // 8-float load that is permuted into four per-tap vectors, so every tap must lie inside that load.
+  // Debug check of that precondition: largest offset in the group + 3 further taps must not exceed lane 7.
+#ifdef _DEBUG
+  for (int x = 0; x < width; x += 8)
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 7]; // assumes offsets are non-decreasing, so this is the largest - TODO confirm
+    assert((end_off - start_off) + 3 <= 7); // permutevar8x32 keeps only the low 3 index bits; a larger index would wrap silently
+  }
+#endif
+
+  int filter_size = program->filter_size; // distance in floats between the coefficient rows of consecutive output pixels
+
+  const float* AVS_RESTRICT current_coeff;
+  __m256i one_epi32 = _mm256_set1_epi32(1);
+
+  src_pitch = src_pitch / sizeof(float); // byte pitch -> float elements
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 8)
+  {
+    // prepare coefs in transposed V-form: rows i and i + 4 are combined by the load helper (presumably low/high 128-bit lane - TODO confirm)
+    __m256 coef_0 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+    __m256 coef_1 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+    __m256 coef_2 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+    __m256 coef_3 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+    // 4x4 transpose within each 128-bit lane: afterwards coef_k is meant to hold tap k of all 8 output pixels
+    _MM_TRANSPOSE8_LANE4_PS(coef_0, coef_1, coef_2, coef_3);
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+    __m256i perm_0 = _mm256_set_epi32(program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart,  0);
+    __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
+    __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
+    __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+    for (int y = 0; y < height; y++)
+    {
+      __m256 data_src = _mm256_loadu_ps(src_ptr);
+      // data_k lane i = source sample for tap k of output pixel x + i
+      __m256 data_0 = _mm256_permutevar8x32_ps(data_src, perm_0);
+      __m256 data_1 = _mm256_permutevar8x32_ps(data_src, perm_1);
+      __m256 data_2 = _mm256_permutevar8x32_ps(data_src, perm_2);
+      __m256 data_3 = _mm256_permutevar8x32_ps(data_src, perm_3);
+      // two independent accumulators (taps 0-1 and taps 2-3), summed at the store
+      __m256 result0 = _mm256_mul_ps(data_0, coef_0);
+      __m256 result1 = _mm256_mul_ps(data_2, coef_2);
+
+      result0 = _mm256_fmadd_ps(data_1, coef_1, result0);
+      result1 = _mm256_fmadd_ps(data_3, coef_3, result1);
+
+      _mm256_store_ps(dst_ptr, _mm256_add_ps(result0, result1)); // aligned 8-float store
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+    current_coeff += filter_size * 8; // step to the coefficient rows of the next 8 output pixels
+  }
+}
+
+void resize_h_planar_float_avx2_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  // Horizontal float resize applying 8 taps, 8 output pixels per step. Taps 0..3 come from an unaligned
+  // 8-float load at the stripe start, taps 4..7 from a second load shifted by 4 floats; both use the same permutes.
+  // Debug check: within each load the largest relative offset + 3 further taps must not exceed lane 7.
+#ifdef _DEBUG
+  for (int x = 0; x < width; x += 8)
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 7]; // assumes offsets are non-decreasing, so this is the largest - TODO confirm
+    assert((end_off - start_off) + 3 <= 7); // permutevar8x32 keeps only the low 3 index bits; a larger index would wrap silently
+  }
+#endif
+
+  int filter_size = program->filter_size; // stride between coefficient rows, in floats
+  assert(filter_size >= 8); // two 4-float halves (taps 0..3 and 4..7) are loaded from every coefficient row
+
+  const float* AVS_RESTRICT current_coeff;
+  __m256i one_epi32 = _mm256_set1_epi32(1);
+
+  src_pitch = src_pitch / sizeof(float); // byte pitch -> float elements
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 8)
+  {
+    // prepare coefs in transposed V-form: rows i and i + 4 are combined by the load helper (presumably low/high 128-bit lane - TODO confirm)
+    __m256 coef_0 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+    __m256 coef_1 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+    __m256 coef_2 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+    __m256 coef_3 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+    // second half of every coefficient row (taps 4..7)
+    __m256 coef_4 = _mm256_load_2_m128(current_coeff + filter_size * 0 + 4, current_coeff + filter_size * 4 + 4);
+    __m256 coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 1 + 4, current_coeff + filter_size * 5 + 4);
+    __m256 coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 2 + 4, current_coeff + filter_size * 6 + 4);
+    __m256 coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 3 + 4, current_coeff + filter_size * 7 + 4);
+    // 4x4 transposes within each 128-bit lane: afterwards coef_k is meant to hold tap k of all 8 output pixels
+    _MM_TRANSPOSE8_LANE4_PS(coef_0, coef_1, coef_2, coef_3);
+    _MM_TRANSPOSE8_LANE4_PS(coef_4, coef_5, coef_6, coef_7);
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+    __m256i perm_0 = _mm256_set_epi32(program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
+    __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
+    __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
+    __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+    for (int y = 0; y < height; y++)
+    {
+      __m256 result;
+
+      // perm_0..perm_3 are loop-invariant and were computed once per stripe above,
+      // so nothing has to be rebuilt per row.
+      //
+      // Taps 0..3: a single unaligned load starting at the stripe's first source pixel,
+      // permuted so that lane i of data_k holds the sample for tap k of output pixel x + i.
+      __m256 data_src = _mm256_loadu_ps(src_ptr);
+
+      __m256 data_0 = _mm256_permutevar8x32_ps(data_src, perm_0);
+      __m256 data_1 = _mm256_permutevar8x32_ps(data_src, perm_1);
+      __m256 data_2 = _mm256_permutevar8x32_ps(data_src, perm_2);
+      __m256 data_3 = _mm256_permutevar8x32_ps(data_src, perm_3);
+
+      __m256 result0 = _mm256_mul_ps(data_0, coef_0);
+      __m256 result1 = _mm256_mul_ps(data_2, coef_2);
+
+      result0 = _mm256_fmadd_ps(data_1, coef_1, result0);
+      result1 = _mm256_fmadd_ps(data_3, coef_3, result1);
+
+      result = _mm256_add_ps(result0, result1);
+
+      // next 4 samples + next 4 coefs (taps 4..7)
+      data_src = _mm256_loadu_ps(src_ptr + 4);
+
+      // The permute indices are reused unchanged on purpose: the load above is shifted
+      // by 4 floats, so index (relative offset + k) now addresses tap 4 + k of each
+      // output pixel. Incrementing the indices instead would need lanes above 7,
+      // which an 8-lane permute cannot address.
+      //
+      data_0 = _mm256_permutevar8x32_ps(data_src, perm_0);
+      data_1 = _mm256_permutevar8x32_ps(data_src, perm_1);
+      data_2 = _mm256_permutevar8x32_ps(data_src, perm_2);
+      data_3 = _mm256_permutevar8x32_ps(data_src, perm_3);
+
+      result0 = _mm256_mul_ps(data_0, coef_4);
+      result1 = _mm256_mul_ps(data_2, coef_6);
+
+      result0 = _mm256_fmadd_ps(data_1, coef_5, result0);
+      result1 = _mm256_fmadd_ps(data_3, coef_7, result1);
+
+      result = _mm256_add_ps(result, result0);
+      result = _mm256_add_ps(result, result1);
+
+      _mm256_store_ps(dst_ptr, result); // aligned 8-float store
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+    current_coeff += filter_size * 8; // step to the coefficient rows of the next 8 output pixels
+  }
+}
diff --git a/avs_core/filters/intel/resample_avx2.h b/avs_core/filters/intel/resample_avx2.h
index cef2a5125..e57e24a56 100644
--- a/avs_core/filters/intel/resample_avx2.h
+++ b/avs_core/filters/intel/resample_avx2.h
@@ -56,6 +56,11 @@ void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_p
 
 void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
+void resize_h_planar_float_avx_gather_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); // 8 taps, source fetched with AVX2 gathers
+// Permute-based float H-resizers: one unaligned load per row, per-tap vectors built with lane permutes (ks4 = 4 taps, ks8 = 8 taps).
+void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+void resize_h_planar_float_avx2_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
 // Transpose 4x4 blocks within each lane
 #define _MM_TRANSPOSE8_LANE4_PS(row0, row1, row2, row3) \
 	do { \
diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp
index 7a0137c15..b92fa2ab5 100644
--- a/avs_core/filters/intel/resample_avx512.cpp
+++ b/avs_core/filters/intel/resample_avx512.cpp
@@ -118,6 +118,488 @@ void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE*
 
 }
 
+
+void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  // Horizontal float resize applying 4 taps. Each step produces 16 output pixels per row from ONE unaligned
+  // 16-float load that is permuted into four per-tap vectors, so every tap must lie inside that load.
+  // Debug check of that precondition: largest offset in the group + 3 further taps must not exceed lane 15.
+#ifdef _DEBUG
+  for (int x = 0; x < width; x += 16)
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 15]; // assumes offsets are non-decreasing, so this is the largest - TODO confirm
+    assert((end_off - start_off) + 3 <= 15); // permutexvar_ps keeps only the low 4 index bits; a larger index would wrap silently
+  }
+#endif
+
+  int filter_size = program->filter_size; // distance in floats between the coefficient rows of consecutive output pixels
+
+  const float* AVS_RESTRICT current_coeff;
+  __m512i one_epi32 = _mm512_set1_epi32(1);
+
+  src_pitch = src_pitch / sizeof(float); // byte pitch -> float elements
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 16)
+  {
+    // prepare coefs in transposed V-form: rows i, i + 4, i + 8, i + 12 are combined by the load helper (presumably one per 128-bit lane - TODO confirm)
+    __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+    __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+    __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+    __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+    // presumably a 4x4 transpose within each 128-bit lane, leaving coef_rk = tap k of all 16 output pixels - TODO confirm
+    _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+    __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \
+      program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
+    __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+    __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+    __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+#if 0
+    for (int y = 0; y < height; y++) // single row proc
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+#endif
+
+    const int height_mod2 = (height / 2) * 2; // Process pairs of rows for better efficiency
+    // Paired rows were found no slower than single rows; kept for hosts with better memory and compute throughput.
+    for (int y = 0; y < height_mod2; y+=2)
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+      __m512 data_src_2 = _mm512_loadu_ps(src_ptr + src_pitch);
+      // data_k lane i = source sample for tap k of output pixel x + i (first row of the pair)
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+      // same permutes for the second row of the pair
+      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0, data_src_2);
+      __m512 data_1_2 = _mm512_permutexvar_ps(perm_1, data_src_2);
+      __m512 data_2_2 = _mm512_permutexvar_ps(perm_2, data_src_2);
+      __m512 data_3_2 = _mm512_permutexvar_ps(perm_3, data_src_2);
+      // result0 accumulates the first row, result1 the second row; each is a single FMA chain over taps 0..3
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_0_2, coef_r0);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_1_2, coef_r1, result1);
+
+      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
+      result1 = _mm512_fmadd_ps(data_2_2, coef_r2, result1);
+
+      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
+      result1 = _mm512_fmadd_ps(data_3_2, coef_r3, result1);
+
+      _mm512_store_ps(dst_ptr, result0); // aligned 16-float stores
+      _mm512_store_ps(dst_ptr + dst_pitch, result1);
+
+      dst_ptr += dst_pitch * 2;
+      src_ptr += src_pitch * 2;
+    }
+
+    if (height > height_mod2) // last row; NOTE(review): sums taps as (0,1)+(2,3), unlike the single chain above, so rounding may differ slightly
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+    }
+
+    current_coeff += filter_size * 16; // step to the coefficient rows of the next 16 output pixels
+  }
+}
+
+void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  // Horizontal float resize applying 8 taps, 16 output pixels per step from ONE unaligned 16-float load per row.
+  // Debug check below: largest offset in the group + 7 further taps must stay inside that load (lanes 0..15).
+#ifdef _DEBUG
+  for (int x = 0; x < width; x += 16)
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 15]; // assumes offsets are non-decreasing, so this is the largest - TODO confirm
+    assert((end_off - start_off) + 7 <= 15); // permutexvar_ps keeps only the low 4 index bits; a larger index would wrap silently
+  }
+#endif
+
+  int filter_size = program->filter_size; // distance in floats between the coefficient rows of consecutive output pixels
+
+  const float* AVS_RESTRICT current_coeff;
+  __m512i one_epi32 = _mm512_set1_epi32(1);
+
+  src_pitch = src_pitch / sizeof(float); // byte pitch -> float elements
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 16)
+  {
+    // prepare coefs in transposed V-form, use gathering - not very slow until TRANSPOSE8_ is designed
+    // lane i must read the coefficient row of output pixel x + i, hence row offsets filter_size * 0..15 (lane 0 is the last argument)
+    __m512i offsets = _mm512_set_epi32(filter_size * 15, filter_size * 14, filter_size * 13, filter_size * 12, filter_size * 11, filter_size * 10, filter_size * 9, filter_size * 8, \
+                                       filter_size * 7, filter_size * 6, filter_size * 5, filter_size * 4, filter_size * 3, filter_size * 2, filter_size * 1, filter_size * 0 );
+
+    __m512 coef_r0 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+    // each further gather advances every lane by one float, i.e. to the next tap of the same coefficient row
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r1 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r2 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r3 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r4 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r5 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r6 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r7 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+    __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \
+      program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
+    __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+    __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+    __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+    __m512i perm_4 = _mm512_add_epi32(perm_3, one_epi32);
+    __m512i perm_5 = _mm512_add_epi32(perm_4, one_epi32);
+    __m512i perm_6 = _mm512_add_epi32(perm_5, one_epi32);
+    __m512i perm_7 = _mm512_add_epi32(perm_6, one_epi32);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+#if 0
+    for (int y = 0; y < height; y++) // single row proc
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+      __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src);
+      __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src);
+      __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src);
+      __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_4, coef_r4);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_5, coef_r5, result1);
+
+      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
+      result1 = _mm512_fmadd_ps(data_6, coef_r6, result1);
+
+      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
+      result1 = _mm512_fmadd_ps(data_7, coef_r7, result1);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+#endif
+
+    const int height_mod2 = (height / 2) * 2; // Process pairs of rows for better efficiency
+    // Paired rows were found no slower than single rows; kept for hosts with better memory and compute throughput.
+    for (int y = 0; y < height_mod2; y += 2)
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+      __m512 data_src_2 = _mm512_loadu_ps(src_ptr + src_pitch);
+      // data_k lane i = source sample for tap k of output pixel x + i (first row of the pair)
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+      __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src);
+      __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src);
+      __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src);
+      __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src);
+      // same permutes for the second row of the pair
+      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0, data_src_2);
+      __m512 data_1_2 = _mm512_permutexvar_ps(perm_1, data_src_2);
+      __m512 data_2_2 = _mm512_permutexvar_ps(perm_2, data_src_2);
+      __m512 data_3_2 = _mm512_permutexvar_ps(perm_3, data_src_2);
+      __m512 data_4_2 = _mm512_permutexvar_ps(perm_4, data_src_2);
+      __m512 data_5_2 = _mm512_permutexvar_ps(perm_5, data_src_2);
+      __m512 data_6_2 = _mm512_permutexvar_ps(perm_6, data_src_2);
+      __m512 data_7_2 = _mm512_permutexvar_ps(perm_7, data_src_2);
+      // result0 accumulates the first row, result1 the second row; each is a single FMA chain over taps 0..7
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_0_2, coef_r0);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_1_2, coef_r1, result1);
+
+      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
+      result1 = _mm512_fmadd_ps(data_2_2, coef_r2, result1);
+
+      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
+      result1 = _mm512_fmadd_ps(data_3_2, coef_r3, result1);
+
+      result0 = _mm512_fmadd_ps(data_4, coef_r4, result0);
+      result1 = _mm512_fmadd_ps(data_4_2, coef_r4, result1);
+
+      result0 = _mm512_fmadd_ps(data_5, coef_r5, result0);
+      result1 = _mm512_fmadd_ps(data_5_2, coef_r5, result1);
+
+      result0 = _mm512_fmadd_ps(data_6, coef_r6, result0);
+      result1 = _mm512_fmadd_ps(data_6_2, coef_r6, result1);
+
+      result0 = _mm512_fmadd_ps(data_7, coef_r7, result0);
+      result1 = _mm512_fmadd_ps(data_7_2, coef_r7, result1);
+
+      _mm512_store_ps(dst_ptr, result0); // aligned 16-float stores
+      _mm512_store_ps(dst_ptr + dst_pitch, result1);
+
+      dst_ptr += dst_pitch * 2;
+      src_ptr += src_pitch * 2;
+    }
+
+    if (height > height_mod2) // last row; NOTE(review): sums taps as (0..3)+(4..7), unlike the single chain above, so rounding may differ slightly
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+      __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src);
+      __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src);
+      __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src);
+      __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_4, coef_r4);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_5, coef_r5, result1);
+
+      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
+      result1 = _mm512_fmadd_ps(data_6, coef_r6, result1);
+
+      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
+      result1 = _mm512_fmadd_ps(data_7, coef_r7, result1);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+    }
+
+    current_coeff += filter_size * 16; // step to the coefficient rows of the next 16 output pixels
+  }
+}
+
+void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  // assert - check if max pixel_offset is not above single load of 16 src floats (or need several loads and more complex permute program)
+#ifdef _DEBUG
+  for (int x = 0; x < width; x += 16)
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 15];
+    assert((end_off - start_off) > 15);
+  }
+#endif
+
+  int filter_size = program->filter_size;
+
+  const float* AVS_RESTRICT current_coeff;
+  __m512i one_epi32 = _mm512_set1_epi32(1);
+
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 16)
+  {
+    // prepare coefs in transposed V-form, use gathering - not very slow until TRANSPOSE8_ is designed
+
+    __m512i offsets = _mm512_set_epi32(filter_size * 15, filter_size * 14, filter_size * 13, filter_size * 12, filter_size * 11, filter_size * 10, filter_size * 9, filter_size * 8, \
+      filter_size * 7, filter_size * 6, filter_size * 5, filter_size * 4, filter_size * 3, filter_size * 2, filter_size * 1, filter_size * 0);
+
+    __m512 coef_r0 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r1 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r2 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r3 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r4 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r5 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r6 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r7 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r8 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r9 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r10 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r11 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r12 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r13 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r14 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r15 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+    __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \
+      program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
+    __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+    __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+    __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+    __m512i perm_4 = _mm512_add_epi32(perm_3, one_epi32);
+    __m512i perm_5 = _mm512_add_epi32(perm_4, one_epi32);
+    __m512i perm_6 = _mm512_add_epi32(perm_5, one_epi32);
+    __m512i perm_7 = _mm512_add_epi32(perm_6, one_epi32);
+    __m512i perm_8 = _mm512_add_epi32(perm_7, one_epi32);
+    __m512i perm_9 = _mm512_add_epi32(perm_8, one_epi32);
+    __m512i perm_10 = _mm512_add_epi32(perm_9, one_epi32);
+    __m512i perm_11 = _mm512_add_epi32(perm_10, one_epi32);
+    __m512i perm_12 = _mm512_add_epi32(perm_11, one_epi32);
+    __m512i perm_13 = _mm512_add_epi32(perm_12, one_epi32);
+    __m512i perm_14 = _mm512_add_epi32(perm_13, one_epi32);
+    __m512i perm_15 = _mm512_add_epi32(perm_14, one_epi32); // to do : test if better to add one_epi32 in the loop and only store perm_0 complex to fill dataword
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+    for (int y = 0; y < height; y++) // single row proc
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+      __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src);
+      __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src);
+      __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src);
+      __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src);
+      __m512 data_8 = _mm512_permutexvar_ps(perm_8, data_src);
+      __m512 data_9 = _mm512_permutexvar_ps(perm_9, data_src);
+      __m512 data_10 = _mm512_permutexvar_ps(perm_10, data_src);
+      __m512 data_11 = _mm512_permutexvar_ps(perm_11, data_src);
+      __m512 data_12 = _mm512_permutexvar_ps(perm_12, data_src);
+      __m512 data_13 = _mm512_permutexvar_ps(perm_13, data_src);
+      __m512 data_14 = _mm512_permutexvar_ps(perm_14, data_src);
+      __m512 data_15 = _mm512_permutexvar_ps(perm_15, data_src);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_8, coef_r8);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_9, coef_r9, result1);
+
+      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
+      result1 = _mm512_fmadd_ps(data_10, coef_r10, result1);
+
+      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
+      result1 = _mm512_fmadd_ps(data_11, coef_r11, result1);
+
+      result0 = _mm512_fmadd_ps(data_4, coef_r4, result0);
+      result1 = _mm512_fmadd_ps(data_12, coef_r12, result1);
+
+      result0 = _mm512_fmadd_ps(data_5, coef_r5, result0);
+      result1 = _mm512_fmadd_ps(data_13, coef_r13, result1);
+
+      result0 = _mm512_fmadd_ps(data_6, coef_r6, result0);
+      result1 = _mm512_fmadd_ps(data_14, coef_r14, result1);
+
+      result0 = _mm512_fmadd_ps(data_7, coef_r7, result0);
+      result1 = _mm512_fmadd_ps(data_15, coef_r15, result1);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+
+    current_coeff += filter_size * 16;
+  }
+}
+
+
+
 //-------- 512 bit float Verticals
 
 void resize_v_avx512_planar_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
diff --git a/avs_core/filters/intel/resample_avx512.h b/avs_core/filters/intel/resample_avx512.h
index 5dd73276b..dc98e0052 100644
--- a/avs_core/filters/intel/resample_avx512.h
+++ b/avs_core/filters/intel/resample_avx512.h
@@ -39,6 +39,9 @@
 #include "../resample_functions.h"
 
 void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
 void resize_v_avx512_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 

From 489c19aefee83cb426ed6100755775a7a2f2a2cb Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sat, 10 May 2025 04:11:19 -0700
Subject: [PATCH 10/27] Cleanup for AVX2 ks8

---
 avs_core/filters/intel/resample_avx2.cpp | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/avs_core/filters/intel/resample_avx2.cpp b/avs_core/filters/intel/resample_avx2.cpp
index 6dd0a1832..5e22a2f59 100644
--- a/avs_core/filters/intel/resample_avx2.cpp
+++ b/avs_core/filters/intel/resample_avx2.cpp
@@ -1244,7 +1244,6 @@ void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src
 {
 
   // assert - check if max pixel_offset is not above single load of 8 src floats (or need several loads and more complex permute program)
-
 #ifdef _DEBUG
   for (int x = 0; x < width; x += 8)
   {
@@ -1315,7 +1314,6 @@ void resize_h_planar_float_avx2_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src
 {
 
   // assert - check if max pixel_offset is not above single load of 8 src floats (or need several loads and more complex permute program)
-
 #ifdef _DEBUG
   for (int x = 0; x < width; x += 8)
   {
@@ -1368,12 +1366,6 @@ void resize_h_planar_float_avx2_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src
     for (int y = 0; y < height; y++)
     {
       __m256 result;
-
-/*      __m256i perm_0 = perm_start;
-      __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
-      __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
-      __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
-      */
       __m256 data_src = _mm256_loadu_ps(src_ptr);
 
       __m256 data_0 = _mm256_permutevar8x32_ps(data_src, perm_0);
@@ -1392,11 +1384,6 @@ void resize_h_planar_float_avx2_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src
       // next next 4 samples + 4 coefs
       data_src = _mm256_loadu_ps(src_ptr + 4);
 
-/*      perm_0 = _mm256_add_epi32(perm_0, one_epi32); // are we need to reload next +4 src ?
-      perm_1 = _mm256_add_epi32(perm_1, one_epi32);
-      perm_2 = _mm256_add_epi32(perm_2, one_epi32);
-      perm_3 = _mm256_add_epi32(perm_3, one_epi32);
-      */
       data_0 = _mm256_permutevar8x32_ps(data_src, perm_0);
       data_1 = _mm256_permutevar8x32_ps(data_src, perm_1);
       data_2 = _mm256_permutevar8x32_ps(data_src, perm_2);

From 5a85b227364ede1b992d1e16b4e90a8c8c259cf2 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Tue, 13 May 2025 14:32:30 -0700
Subject: [PATCH 11/27] More versions for

resize_h_planar_float_avx512_permutex_vstripe_ks4 . Fastest for big frame size and many threads - with 64 output samples in single row output (smallest number of SDRAM read-write streams ?).
---
 avs_core/filters/intel/resample_avx512.cpp | 374 ++++++++++++++++++++-
 1 file changed, 369 insertions(+), 5 deletions(-)

diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp
index b92fa2ab5..d9b1bd90f 100644
--- a/avs_core/filters/intel/resample_avx512.cpp
+++ b/avs_core/filters/intel/resample_avx512.cpp
@@ -125,18 +125,18 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* s
   // assert - check if max pixel_offset is not above single load of 16 src floats (or need several loads and more complex permute program)
 
 #ifdef _DEBUG
-  for (int x = 0; x < width; x += 16)
+  for (int x = 0; x < width; x++) // check each pair ?
   {
     int start_off = program->pixel_offset[x + 0];
     int end_off = program->pixel_offset[x + 15];
-    assert((end_off - start_off) > 15);
+    assert((end_off - start_off) < 15);
   }
 #endif
 
   int filter_size = program->filter_size;
 
   const float* AVS_RESTRICT current_coeff;
-  __m512i one_epi32 = _mm512_set1_epi32(1);
+  __m512i one_epi32 = _mm512_set1_epi32(1); 
 
   src_pitch = src_pitch / sizeof(float);
   dst_pitch = dst_pitch / sizeof(float);
@@ -146,6 +146,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* s
 
   current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
 
+/* // 16 output samples in H-direction per vstripe
   for (int x = 0; x < width; x += 16)
   {
     // prepare coefs in transposed V-form
@@ -246,6 +247,369 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* s
 
     current_coeff += filter_size * 16;
   }
+*/ // 16 output samples per vstripe
+
+/*
+  // 32 output samples per vstripe
+  for (int x = 0; x < width; x += 32) // processing by 32 sample - it is safe at the end of row ?
+  {
+    // prepare coefs in transposed V-form
+    __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+    __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+    __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+    __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+    // prepare coefs in transposed V-form
+    __m512 coef_r0_2 = _mm512_load_4_m128(current_coeff + filter_size * 16, current_coeff + filter_size * 20, current_coeff + filter_size * 24, current_coeff + filter_size * 28);
+    __m512 coef_r1_2 = _mm512_load_4_m128(current_coeff + filter_size * 17, current_coeff + filter_size * 21, current_coeff + filter_size * 25, current_coeff + filter_size * 29);
+    __m512 coef_r2_2 = _mm512_load_4_m128(current_coeff + filter_size * 18, current_coeff + filter_size * 22, current_coeff + filter_size * 26, current_coeff + filter_size * 30);
+    __m512 coef_r3_2 = _mm512_load_4_m128(current_coeff + filter_size * 19, current_coeff + filter_size * 23, current_coeff + filter_size * 27, current_coeff + filter_size * 31);
+
+    _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
+    _MM_TRANSPOSE16_LANE4_PS(coef_r0_2, coef_r1_2, coef_r2_2, coef_r3_2);
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+    __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \
+      program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
+    __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+    __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+    __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+
+    int iStart_2 = program->pixel_offset[x + 16];
+    __m512i perm_0_2 = _mm512_set_epi32(program->pixel_offset[x + 31] - iStart_2, program->pixel_offset[x + 30] - iStart_2, program->pixel_offset[x + 29] - iStart_2, program->pixel_offset[x + 28] - iStart_2, program->pixel_offset[x + 27] - iStart_2, program->pixel_offset[x + 26] - iStart_2, program->pixel_offset[x + 25] - iStart_2, program->pixel_offset[x + 24] - iStart_2, \
+      program->pixel_offset[x + 23] - iStart_2, program->pixel_offset[x + 22] - iStart_2, program->pixel_offset[x + 21] - iStart_2, program->pixel_offset[x + 20] - iStart_2, program->pixel_offset[x + 19] - iStart_2, program->pixel_offset[x + 18] - iStart_2, program->pixel_offset[x + 17] - iStart_2, program->pixel_offset[x + 16] - iStart_2);
+    __m512i perm_1_2 = _mm512_add_epi32(perm_0, one_epi32);
+    __m512i perm_2_2 = _mm512_add_epi32(perm_1, one_epi32);
+    __m512i perm_3_2 = _mm512_add_epi32(perm_2, one_epi32);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    float* AVS_RESTRICT dst_ptr_2 = dst + x + 16;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+    const float* src_ptr_2 = src + program->pixel_offset[x + 16]; // all permute offsets relative to this start offset
+#if 0
+    for (int y = 0; y < height; y++) // single row proc, 32 output samples
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+      __m512 data_src_2 = _mm512_loadu_ps(src_ptr_2);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+
+      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0_2, data_src_2);
+      __m512 data_1_2 = _mm512_permutexvar_ps(perm_1_2, data_src_2);
+      __m512 data_2_2 = _mm512_permutexvar_ps(perm_2_2, data_src_2);
+      __m512 data_3_2 = _mm512_permutexvar_ps(perm_3_2, data_src_2);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+      __m512 result0_2 = _mm512_mul_ps(data_0_2, coef_r0_2);
+      __m512 result1_2 = _mm512_mul_ps(data_2_2, coef_r2_2);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+      result0_2 = _mm512_fmadd_ps(data_1_2, coef_r1_2, result0_2);
+      result1_2 = _mm512_fmadd_ps(data_3_2, coef_r3_2, result1_2);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+      _mm512_store_ps(dst_ptr_2, _mm512_add_ps(result0_2, result1_2));
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+
+      dst_ptr_2 += dst_pitch;
+      src_ptr_2 += src_pitch;
+
+    }
+#endif // single row
+    //dual rows and 32 per vstripe
+    const int height_mod2 = (height / 2) * 2; // Process pairs of rows for better efficiency
+    // dual-rows not worst in performance - may be left for the future better memory performance and compute performance hosts
+    for (int y = 0; y < height_mod2; y += 2)
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+      __m512 data_src_2 = _mm512_loadu_ps(src_ptr_2);
+
+      __m512 data_src_2r = _mm512_loadu_ps(src_ptr + src_pitch);
+      __m512 data_src_2_2r = _mm512_loadu_ps(src_ptr_2 + src_pitch);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+
+      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0_2, data_src_2);
+      __m512 data_1_2 = _mm512_permutexvar_ps(perm_1_2, data_src_2);
+      __m512 data_2_2 = _mm512_permutexvar_ps(perm_2_2, data_src_2);
+      __m512 data_3_2 = _mm512_permutexvar_ps(perm_3_2, data_src_2);
+
+      __m512 data_0_2r = _mm512_permutexvar_ps(perm_0, data_src_2r);
+      __m512 data_1_2r = _mm512_permutexvar_ps(perm_1, data_src_2r);
+      __m512 data_2_2r = _mm512_permutexvar_ps(perm_2, data_src_2r);
+      __m512 data_3_2r = _mm512_permutexvar_ps(perm_3, data_src_2r);
+
+      __m512 data_0_2_2r = _mm512_permutexvar_ps(perm_0_2, data_src_2_2r);
+      __m512 data_1_2_2r = _mm512_permutexvar_ps(perm_1_2, data_src_2_2r);
+      __m512 data_2_2_2r = _mm512_permutexvar_ps(perm_2_2, data_src_2_2r);
+      __m512 data_3_2_2r = _mm512_permutexvar_ps(perm_3_2, data_src_2_2r);
+
+      // 1r
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+      __m512 result0_2 = _mm512_mul_ps(data_0_2, coef_r0_2);
+      __m512 result1_2 = _mm512_mul_ps(data_2_2, coef_r2_2);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+      result0_2 = _mm512_fmadd_ps(data_1_2, coef_r1_2, result0_2);
+      result1_2 = _mm512_fmadd_ps(data_3_2, coef_r3_2, result1_2);
+
+      // 2r
+      __m512 result0_2r = _mm512_mul_ps(data_0_2r, coef_r0);
+      __m512 result1_2r = _mm512_mul_ps(data_2_2r, coef_r2);
+
+      __m512 result0_2_2r = _mm512_mul_ps(data_0_2_2r, coef_r0_2);
+      __m512 result1_2_2r = _mm512_mul_ps(data_2_2_2r, coef_r2_2);
+
+      result0_2r = _mm512_fmadd_ps(data_1_2r, coef_r1, result0_2r);
+      result1_2r = _mm512_fmadd_ps(data_3_2r, coef_r3, result1_2r);
+
+      result0_2_2r = _mm512_fmadd_ps(data_1_2_2r, coef_r1_2, result0_2_2r);
+      result1_2_2r = _mm512_fmadd_ps(data_3_2_2r, coef_r3_2, result1_2_2r);
+
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+      _mm512_store_ps(dst_ptr_2, _mm512_add_ps(result0_2, result1_2));
+
+      _mm512_store_ps(dst_ptr + dst_pitch, _mm512_add_ps(result0_2r, result1_2r));
+      _mm512_store_ps(dst_ptr_2 + dst_pitch, _mm512_add_ps(result0_2_2r, result1_2_2r));
+
+
+      dst_ptr += dst_pitch * 2;
+      src_ptr += src_pitch * 2;
+
+      dst_ptr_2 += dst_pitch * 2;
+      src_ptr_2 += src_pitch * 2;
+
+    }
+*/
+
+
+// somewhat slower than 32 per vstripe with small cacheable frame sizes, but best performance with large frame sizes and many threads (most SDRAM controller friendly ?)
+
+  for (int x = 0; x < width; x += 64) // processing by 64 output sample - it is safe at the end of row ?
+  {
+    // prepare coefs in transposed V-form
+    __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+    __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+    __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+    __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+    // prepare coefs in transposed V-form
+    __m512 coef_r0_2 = _mm512_load_4_m128(current_coeff + filter_size * 16, current_coeff + filter_size * 20, current_coeff + filter_size * 24, current_coeff + filter_size * 28);
+    __m512 coef_r1_2 = _mm512_load_4_m128(current_coeff + filter_size * 17, current_coeff + filter_size * 21, current_coeff + filter_size * 25, current_coeff + filter_size * 29);
+    __m512 coef_r2_2 = _mm512_load_4_m128(current_coeff + filter_size * 18, current_coeff + filter_size * 22, current_coeff + filter_size * 26, current_coeff + filter_size * 30);
+    __m512 coef_r3_2 = _mm512_load_4_m128(current_coeff + filter_size * 19, current_coeff + filter_size * 23, current_coeff + filter_size * 27, current_coeff + filter_size * 31);
+
+    // prepare coefs in transposed V-form
+    __m512 coef_r0_3 = _mm512_load_4_m128(current_coeff + filter_size * 32, current_coeff + filter_size * 36, current_coeff + filter_size * 40, current_coeff + filter_size * 44);
+    __m512 coef_r1_3 = _mm512_load_4_m128(current_coeff + filter_size * 33, current_coeff + filter_size * 37, current_coeff + filter_size * 41, current_coeff + filter_size * 45);
+    __m512 coef_r2_3 = _mm512_load_4_m128(current_coeff + filter_size * 34, current_coeff + filter_size * 38, current_coeff + filter_size * 42, current_coeff + filter_size * 46);
+    __m512 coef_r3_3 = _mm512_load_4_m128(current_coeff + filter_size * 35, current_coeff + filter_size * 39, current_coeff + filter_size * 43, current_coeff + filter_size * 47);
+
+    // prepare coefs in transposed V-form
+    __m512 coef_r0_4 = _mm512_load_4_m128(current_coeff + filter_size * 48, current_coeff + filter_size * 52, current_coeff + filter_size * 56, current_coeff + filter_size * 60);
+    __m512 coef_r1_4 = _mm512_load_4_m128(current_coeff + filter_size * 49, current_coeff + filter_size * 53, current_coeff + filter_size * 57, current_coeff + filter_size * 61);
+    __m512 coef_r2_4 = _mm512_load_4_m128(current_coeff + filter_size * 50, current_coeff + filter_size * 54, current_coeff + filter_size * 58, current_coeff + filter_size * 62);
+    __m512 coef_r3_4 = _mm512_load_4_m128(current_coeff + filter_size * 51, current_coeff + filter_size * 55, current_coeff + filter_size * 59, current_coeff + filter_size * 63);
+
+    _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
+    _MM_TRANSPOSE16_LANE4_PS(coef_r0_2, coef_r1_2, coef_r2_2, coef_r3_2);
+    _MM_TRANSPOSE16_LANE4_PS(coef_r0_3, coef_r1_3, coef_r2_3, coef_r3_3);
+    _MM_TRANSPOSE16_LANE4_PS(coef_r0_4, coef_r1_4, coef_r2_4, coef_r3_4);
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+    __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \
+      program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
+
+    int iStart_2 = program->pixel_offset[x + 16];
+    __m512i perm_0_2 = _mm512_set_epi32(program->pixel_offset[x + 31] - iStart_2, program->pixel_offset[x + 30] - iStart_2, program->pixel_offset[x + 29] - iStart_2, program->pixel_offset[x + 28] - iStart_2, program->pixel_offset[x + 27] - iStart_2, program->pixel_offset[x + 26] - iStart_2, program->pixel_offset[x + 25] - iStart_2, program->pixel_offset[x + 24] - iStart_2, \
+      program->pixel_offset[x + 23] - iStart_2, program->pixel_offset[x + 22] - iStart_2, program->pixel_offset[x + 21] - iStart_2, program->pixel_offset[x + 20] - iStart_2, program->pixel_offset[x + 19] - iStart_2, program->pixel_offset[x + 18] - iStart_2, program->pixel_offset[x + 17] - iStart_2, program->pixel_offset[x + 16] - iStart_2);
+
+    int iStart_3 = program->pixel_offset[x + 32];
+    __m512i perm_0_3 = _mm512_set_epi32(program->pixel_offset[x + 47] - iStart_3, program->pixel_offset[x + 46] - iStart_3, program->pixel_offset[x + 45] - iStart_3, program->pixel_offset[x + 44] - iStart_3, program->pixel_offset[x + 43] - iStart_3, program->pixel_offset[x + 42] - iStart_3, program->pixel_offset[x + 41] - iStart_3, program->pixel_offset[x + 40] - iStart_3, \
+      program->pixel_offset[x + 39] - iStart_3, program->pixel_offset[x + 38] - iStart_3, program->pixel_offset[x + 37] - iStart_3, program->pixel_offset[x + 36] - iStart_3, program->pixel_offset[x + 35] - iStart_3, program->pixel_offset[x + 34] - iStart_3, program->pixel_offset[x + 33] - iStart_3, program->pixel_offset[x + 32] - iStart_3);
+
+    int iStart_4 = program->pixel_offset[x + 48];
+    __m512i perm_0_4 = _mm512_set_epi32(program->pixel_offset[x + 63] - iStart_4, program->pixel_offset[x + 62] - iStart_4, program->pixel_offset[x + 61] - iStart_4, program->pixel_offset[x + 60] - iStart_4, program->pixel_offset[x + 59] - iStart_4, program->pixel_offset[x + 58] - iStart_4, program->pixel_offset[x + 57] - iStart_4, program->pixel_offset[x + 56] - iStart_4, \
+      program->pixel_offset[x + 55] - iStart_4, program->pixel_offset[x + 54] - iStart_4, program->pixel_offset[x + 53] - iStart_4, program->pixel_offset[x + 52] - iStart_4, program->pixel_offset[x + 51] - iStart_4, program->pixel_offset[x + 50] - iStart_4, program->pixel_offset[x + 49] - iStart_4, program->pixel_offset[x + 48] - iStart_4);
+
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+    const float* src_ptr_2 = src + program->pixel_offset[x + 16]; // all permute offsets relative to this start offset
+    const float* src_ptr_3 = src + program->pixel_offset[x + 32]; // all permute offsets relative to this start offset
+    const float* src_ptr_4 = src + program->pixel_offset[x + 48]; // all permute offsets relative to this start offset
+
+    // Per row: four independent 16-lane groups give 64 output floats, each the 4-term sum of data_k * coef_rk.
+    for (int y = 0; y < height; y++) // single row proc, 64 output samples
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+      __m512 data_src_2 = _mm512_loadu_ps(src_ptr_2);
+      __m512 data_src_3 = _mm512_loadu_ps(src_ptr_3);
+      __m512 data_src_4 = _mm512_loadu_ps(src_ptr_4);
+
+      // 1st group: data_k is permuted with index vector perm_0 + k (k = 0..3)
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+
+      __m512i perm_next = _mm512_add_epi32(perm_0, one_epi32);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_next, data_src);
+
+      perm_next = _mm512_add_epi32(perm_next, one_epi32);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_next, data_src);
+
+      perm_next = _mm512_add_epi32(perm_next, one_epi32);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_next, data_src);
+
+      // 2nd group: same scheme with perm_0_2 / data_src_2
+      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0_2, data_src_2);
+
+      __m512i perm_next_2 = _mm512_add_epi32(perm_0_2, one_epi32);
+      __m512 data_1_2 = _mm512_permutexvar_ps(perm_next_2, data_src_2);
+
+      perm_next_2 = _mm512_add_epi32(perm_next_2, one_epi32); // perm_0_2 + 2 (keep incrementing, do not restart from perm_0_2)
+      __m512 data_2_2 = _mm512_permutexvar_ps(perm_next_2, data_src_2);
+
+      perm_next_2 = _mm512_add_epi32(perm_next_2, one_epi32); // perm_0_2 + 3
+      __m512 data_3_2 = _mm512_permutexvar_ps(perm_next_2, data_src_2);
+
+      // 3rd group: same scheme with perm_0_3 / data_src_3
+      __m512 data_0_3 = _mm512_permutexvar_ps(perm_0_3, data_src_3);
+
+      __m512i perm_next_3 = _mm512_add_epi32(perm_0_3, one_epi32);
+      __m512 data_1_3 = _mm512_permutexvar_ps(perm_next_3, data_src_3);
+
+      perm_next_3 = _mm512_add_epi32(perm_next_3, one_epi32); // perm_0_3 + 2
+      __m512 data_2_3 = _mm512_permutexvar_ps(perm_next_3, data_src_3);
+
+      perm_next_3 = _mm512_add_epi32(perm_next_3, one_epi32); // perm_0_3 + 3
+      __m512 data_3_3 = _mm512_permutexvar_ps(perm_next_3, data_src_3);
+
+      // 4th group: same scheme with perm_0_4 / data_src_4
+      __m512 data_0_4 = _mm512_permutexvar_ps(perm_0_4, data_src_4);
+
+      __m512i perm_next_4 = _mm512_add_epi32(perm_0_4, one_epi32);
+      __m512 data_1_4 = _mm512_permutexvar_ps(perm_next_4, data_src_4);
+
+      perm_next_4 = _mm512_add_epi32(perm_next_4, one_epi32); // perm_0_4 + 2
+      __m512 data_2_4 = _mm512_permutexvar_ps(perm_next_4, data_src_4);
+
+      perm_next_4 = _mm512_add_epi32(perm_next_4, one_epi32); // perm_0_4 + 3
+      __m512 data_3_4 = _mm512_permutexvar_ps(perm_next_4, data_src_4); // 4th group reads its own source vector
+
+      // two accumulators per group (products 0+1 and products 2+3), summed only at the store
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+      __m512 result0_2 = _mm512_mul_ps(data_0_2, coef_r0_2);
+      __m512 result1_2 = _mm512_mul_ps(data_2_2, coef_r2_2);
+
+      __m512 result0_3 = _mm512_mul_ps(data_0_3, coef_r0_3);
+      __m512 result1_3 = _mm512_mul_ps(data_2_3, coef_r2_3);
+
+      __m512 result0_4 = _mm512_mul_ps(data_0_4, coef_r0_4);
+      __m512 result1_4 = _mm512_mul_ps(data_2_4, coef_r2_4);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+      result0_2 = _mm512_fmadd_ps(data_1_2, coef_r1_2, result0_2);
+      result1_2 = _mm512_fmadd_ps(data_3_2, coef_r3_2, result1_2);
+      result0_3 = _mm512_fmadd_ps(data_1_3, coef_r1_3, result0_3);
+      result1_3 = _mm512_fmadd_ps(data_3_3, coef_r3_3, result1_3);
+      result0_4 = _mm512_fmadd_ps(data_1_4, coef_r1_4, result0_4); // products 1 and 3 of the 4th group were never accumulated before
+      result1_4 = _mm512_fmadd_ps(data_3_4, coef_r3_4, result1_4);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+      _mm512_store_ps(dst_ptr + 16, _mm512_add_ps(result0_2, result1_2));
+      _mm512_store_ps(dst_ptr + 32, _mm512_add_ps(result0_3, result1_3));
+      _mm512_store_ps(dst_ptr + 48, _mm512_add_ps(result0_4, result1_4));
+
+      dst_ptr += dst_pitch;
+
+      src_ptr += src_pitch;
+      src_ptr_2 += src_pitch;
+      src_ptr_3 += src_pitch;
+      src_ptr_4 += src_pitch;
+    }
+
+
+/*
+    const int height_mod2 = (height / 2) * 2; // Process pairs of rows for better efficiency
+    // dual-rows not worst in performance - may be left for the future better memory performance and compute performance hosts
+    for (int y = 0; y < height_mod2; y += 2)
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+      __m512 data_src_2 = _mm512_loadu_ps(src_ptr + src_pitch);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+
+      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0, data_src_2);
+      __m512 data_1_2 = _mm512_permutexvar_ps(perm_1, data_src_2);
+      __m512 data_2_2 = _mm512_permutexvar_ps(perm_2, data_src_2);
+      __m512 data_3_2 = _mm512_permutexvar_ps(perm_3, data_src_2);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_0_2, coef_r0);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_1_2, coef_r1, result1);
+
+      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
+      result1 = _mm512_fmadd_ps(data_2_2, coef_r2, result1);
+
+      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
+      result1 = _mm512_fmadd_ps(data_3_2, coef_r3, result1);
+
+      _mm512_store_ps(dst_ptr, result0);
+      _mm512_store_ps(dst_ptr + dst_pitch, result1);
+
+      dst_ptr += dst_pitch * 2;
+      src_ptr += src_pitch * 2;
+    }
+
+    if (height > height_mod2) // last row
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+    }
+*/ // dual rows
+
+    current_coeff += filter_size * 64;
+  }
+
+
 }
 
 void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
@@ -264,7 +628,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* s
   int filter_size = program->filter_size;
 
   const float* AVS_RESTRICT current_coeff;
-  __m512i one_epi32 = _mm512_set1_epi32(1);
+  __m512i one_epi32 = _mm512_set1_epi32(1); 
 
   src_pitch = src_pitch / sizeof(float);
   dst_pitch = dst_pitch / sizeof(float);
@@ -455,7 +819,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE*
   int filter_size = program->filter_size;
 
   const float* AVS_RESTRICT current_coeff;
-  __m512i one_epi32 = _mm512_set1_epi32(1);
+  __m512i one_epi32 = _mm512_set1_epi32(1); 
 
   src_pitch = src_pitch / sizeof(float);
   dst_pitch = dst_pitch / sizeof(float);

From d58b402be460e1918eb9949d2079a6ae26ed794b Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Thu, 22 May 2025 18:28:35 +0300
Subject: [PATCH 12/27] Pinterf update of 20.05.2025 and

added new universal function for AVX2 float ks4 processing using auto-selection between gathering by all addresses offsets or small load and permuting.
---
 avs_core/filters/resample.cpp           | 111 ++++++++++++++----------
 avs_core/filters/resample_functions.cpp |   4 +-
 avs_core/filters/resample_functions.h   |  29 ++++---
 3 files changed, 86 insertions(+), 58 deletions(-)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index 8ff15bb66..9bb6cd149 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -36,7 +36,9 @@
 #ifdef INTEL_INTRINSICS
 #include "intel/resample_sse.h"
 #include "intel/resample_avx2.h"
+#ifdef INTEL_INTRINSICS_AVX512
 #include "intel/resample_avx512.h"
+#endif
 #include "intel/turn_sse.h"
 #endif
 #include <avs/config.h>
@@ -72,15 +74,34 @@
 // while maintaining correct coefficient positioning and proper zero padding.
 
 
+static void checkAndSetOverread(int end_pos, SafeLimit& safelimit, int start_pos, int i, int source_size) { // end_pos: index of the last source pixel a load beginning at start_pos touches; i: target x index
+  if (end_pos >= source_size) { // valid source indices are 0..source_size-1, so end_pos == source_size already reads one pixel too far
+    if (!safelimit.overread_possible) { // register only the first occurrence: the danger zone starts here
+      safelimit.overread_possible = true;
+      safelimit.source_overread_offset = start_pos;
+      safelimit.source_overread_beyond_targetx = i;
+    }
+  }
+}
+
+
 void resize_prepare_coeffs(ResamplingProgram* p, IScriptEnvironment* env, int filter_size_alignment) {
   p->filter_size_alignment = filter_size_alignment;
-  p->overread_possible = false;
+  p->safelimit_filter_size_aligned.overread_possible = false;
+  p->safelimit_4_pixels.overread_possible = false;
+  p->safelimit_8_pixels.overread_possible = false;
+  p->safelimit_16_pixels.overread_possible = false;
+  p->safelimit_32_pixels.overread_possible = false;
 
   // note: filter_size_real was the max(kernel_sizes[])
   int filter_size_aligned = AlignNumber(p->filter_size_real, p->filter_size_alignment);
 
   int target_size_aligned = AlignNumber(p->target_size, ALIGN_RESIZER_TARGET_SIZE);
 
+  // align target_size to ALIGN_RESIZER_TARGET_SIZE units to allow safe multi-pixel (8, or 16 for avx512) processing per cycle in H resizers. modded later.
+  p->target_size_alignment = ALIGN_RESIZER_TARGET_SIZE;
+
+
   // Common variables for both float and integer paths
   void* new_coeff = nullptr;
   void* src_coeff = nullptr;
@@ -147,28 +168,27 @@ void resize_prepare_coeffs(ResamplingProgram* p, IScriptEnvironment* env, int fi
     // we must protect against source scanline overread.
     // Using this not in only 32-bit float resizers is new in 3.7.4.
     const int start_pos = p->pixel_offset[i];
-    const int end_pos_aligned = start_pos + filter_size_aligned - 1;
     const int end_pos = start_pos + p->filter_size_real - 1;
     if (end_pos >= p->source_size) {
       // This issue has already been fixed, so it cannot occur.
     }
 
     // Check for SIMD optimization limits
-    if (end_pos_aligned >= p->source_size) {
-      if (!p->overread_possible) {
-        // Register the first occurrence, because we are entering the danger zone from here.
-        // Up to this point, template-based alignment-aware quick code can be used
-        // in H resizers. But beyond this point an e.g. _mm256_loadu_si256() would read into 
-        // invalid memory area at the end of the frame buffer.
-        p->overread_possible = true;
-        p->source_overread_offset = start_pos;
-        p->source_overread_beyond_targetx = i; 
-      }
-    }
+    // a.) when filter_size_aligned pixels are read (e.g. 16 byte SIMD load: 4 float pixels must be safely read)
+    // b.-e.) same for exactly 4, 8, 16 and 32 pixels
+    // We register only the first occurrence, because we are entering the danger zone from there.
+    // Up to this point, it is safe to read 4/8/... pixels from "start_pos" in the actual line.
+    // e.g. reading 4 floats will not read beyond the last pixel in line. Used in modified H resizers.
+
+    checkAndSetOverread(start_pos + filter_size_aligned - 1, p->safelimit_filter_size_aligned, start_pos, i, p->source_size);
+    checkAndSetOverread(start_pos + 4 - 1, p->safelimit_4_pixels, start_pos, i, p->source_size);
+    checkAndSetOverread(start_pos + 8 - 1, p->safelimit_8_pixels, start_pos, i, p->source_size);
+    checkAndSetOverread(start_pos + 16 - 1, p->safelimit_16_pixels, start_pos, i, p->source_size);
+    checkAndSetOverread(start_pos + 32 - 1, p->safelimit_32_pixels, start_pos, i, p->source_size);
   }
 
   // Fill the extra offset after target_size with fake values.
-  // Our aim is to have a safe, up to 8 pixels/cycle simd loop for V resizers.
+  // Our aim is to have a safe, up to 8 pixels/cycle simd loop for V and specific H resizers.
   // Their coeffs will be 0, so they don't count if such coeffs
   // are multiplied with invalid pixels.
   if (p->target_size < target_size_aligned) {
@@ -177,6 +197,8 @@ void resize_prepare_coeffs(ResamplingProgram* p, IScriptEnvironment* env, int fi
     for (int i = p->target_size; i < target_size_aligned; ++i) {
       p->kernel_sizes[i] = p->filter_size_real;
       p->pixel_offset[i] = 0; // 0th pixel offset makes no harm
+      // even if this ensures the in-line safety, alternative H resizer implementations must
+      // not read beyond last line, where y>=height.
     }
   }
 
@@ -1045,7 +1067,7 @@ void resizer_h_c_generic_uint8_16_vectorized(BYTE* dst8, const BYTE* src8, int d
   dst_pitch /= sizeof(pixel_t);
   src_pitch /= sizeof(pixel_t);
 
-  const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8;
+  const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8;
 
   for (int y = 0; y < height; y++) {
     const short* current_coeff_base = program->pixel_coefficient;
@@ -1585,40 +1607,42 @@ ResamplerH FilteredResizeH::GetResampler(int CPU, int pixelsize, int bits_per_pi
   }
   else { //if (pixelsize == 4)
 #ifdef INTEL_INTRINSICS
-    if (CPU & CPUF_AVX512F) {
-      if ((program->filter_size_real <= 16) && (program->filter_size_real > 8))
-      {
-        return resize_h_planar_float_avx512_permutex_vstripe_ks16;
-      }
-      if ((program->filter_size_real <= 8) && (program->filter_size_real > 4))
-      {
-        return resize_h_planar_float_avx512_permutex_vstripe_ks8;
-      }
-      if (program->filter_size_real <= 4)
-      {
-
-        return resize_h_planar_float_avx512_permutex_vstripe_ks4;
+#ifdef INTEL_INTRINSICS_AVX512
+    if ((CPU & CPUF_AVX512F) && program->filter_size_real <= 4) {
+      //return resize_h_planar_float_avx2_permutex_vstripe_ks4;
+      switch (program->filter_size_real) {
+      case 1: return resize_h_planar_float_avx512_transpose_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_avx512_transpose_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_avx512_transpose_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_avx512_transpose_vstripe_ks4<0>; break;
       }
     }
-
+#endif
     if (CPU & CPUF_AVX2) {
-      if ((program->filter_size_real <= 8) && (program->filter_size_real > 4))
-      {
-        return resize_h_planar_float_avx2_permutex_vstripe_ks8;
-      }
-
-      if(program->filter_size_real <=4)
-      { 
-        return resize_h_planar_float_avx2_permutex_vstripe_ks4;
+      //return resize_h_planar_float_avx2_permutex_vstripe_ks4;
+      
+      switch (program->filter_size_real) {
+/*      case 1: return resize_h_planar_float_avx_transpose_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_avx_transpose_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_avx_transpose_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_avx_transpose_vstripe_ks4<0>; break;*/
+      case 1: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<0>; break;
+      default: return resizer_h_avx2_generic_float;
       }
-      return resizer_h_avx2_generic_float;
+      
     }
     if (CPU & CPUF_SSSE3) {
-      if (program->filter_size_real <= 4)
-      {
-          return resize_h_planar_float_sse_transpose_vstripe_ks4;
+      //      return resizer_h_ssse3_generic_float;
+      switch (program->filter_size_real) {
+      case 1: return resize_h_planar_float_sse_transpose_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_sse_transpose_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_sse_transpose_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_sse_transpose_vstripe_ks4<0>; break;
+      default: return resizer_h_ssse3_generic_float;
       }
-		return resizer_h_ssse3_generic_float;
     }
 #endif
     return resize_h_c_planar<float, 0>;
@@ -1818,9 +1842,6 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi
     else // pixelsize== 4
     {
 #ifdef INTEL_INTRINSICS
-      if (CPU & CPUF_AVX512F) {
-        return resize_v_avx512_planar_float;
-      }
       if (CPU & CPUF_AVX2) {
         return resize_v_avx2_planar_float;
       }
diff --git a/avs_core/filters/resample_functions.cpp b/avs_core/filters/resample_functions.cpp
index 437bc6a73..4b0b3399e 100644
--- a/avs_core/filters/resample_functions.cpp
+++ b/avs_core/filters/resample_functions.cpp
@@ -539,9 +539,9 @@ ResamplingProgram* ResamplingFunction::GetResamplingProgram(int source_size, dou
     // in order not to have NaN floats
     if (start_pos + AlignNumber(fir_filter_size, ALIGN_FLOAT_RESIZER_COEFF_SIZE) - 1 > source_size - 1)
     {
-      if (!program->overread_possible) {
+      if (!program->overread_possible_filter_size_aligned) {
         // register the first occurance
-        program->overread_possible = true;
+        program->overread_possible_filter_size_aligned = true;
         program->source_overread_offset = start_pos;
         program->source_overread_beyond_targetx = i;
       }
diff --git a/avs_core/filters/resample_functions.h b/avs_core/filters/resample_functions.h
index 4e7b71f58..738046824 100644
--- a/avs_core/filters/resample_functions.h
+++ b/avs_core/filters/resample_functions.h
@@ -48,10 +48,16 @@ constexpr int FPScale = 1 << FPScale8bits; // fixed point scaler (1<<14)
 // for 16 bits: one bit less
 constexpr int FPScale16bits = 13;
 constexpr int FPScale16 = 1 << FPScale16bits; // fixed point scaler for 10-16 bit SIMD signed operation
-constexpr int ALIGN_RESIZER_TARGET_SIZE = 8;
+constexpr int ALIGN_RESIZER_TARGET_SIZE = 16; // 16: avx512 float Horizontal
 // 09-14-2002 - Vlad59 - Lanczos3Resize - Constant added
 #define M_PI 3.14159265358979323846
 
+struct SafeLimit { // per-load-width record of where multi-pixel source loads stop being safe within a line
+  bool overread_possible; // set once a load of the tracked width would reach past the end of the source line
+  int source_overread_offset; // source start position (pixel_offset) at the first such occurrence
+  int source_overread_beyond_targetx; // target x index of the first such occurrence; smaller x are safe
+};
+
 struct ResamplingProgram {
   IScriptEnvironment * Env;
   int source_size, target_size;
@@ -59,6 +65,7 @@ struct ResamplingProgram {
   int filter_size;
   int filter_size_real; // maybe less than filter_size if dimensions are small
   int filter_size_alignment; // for info, 1 (C, nonvector-friendly), 8 (sse or avx2) or 16 (avx2)
+  int target_size_alignment; // coeff table exists (and contains zero coeffs) even beyond target_size. Helps alternative H resizers.
 
   // Array of Integer indicate starting point of sampling
   std::vector<int> pixel_offset;
@@ -74,24 +81,24 @@ struct ResamplingProgram {
   std::vector<short> kernel_sizes; 
   // 3.7.4- can be different for each line but then they get equalized and aligned.
 
-  // anti-overread helpers for float resizer simd code reading 8 pixels from a given offset
-  bool overread_possible;
-  int source_overread_offset; // offset from where reading 8 bytes requires masking garbage on the right side
-  int source_overread_beyond_targetx; 
   // in H resizers danger zone starts from here.
-  // When reading aligned_filter_size elements from (src+offset) no longer fits image scanline dimensions
-
+  // When reading multiple (SIMD load) source pixels from (src+offset) and it no
+  // longer fits image scanline dimensions (width)
+  SafeLimit safelimit_filter_size_aligned = { false, -1, -1 };
+  SafeLimit safelimit_4_pixels = { false, -1, -1 };
+  SafeLimit safelimit_8_pixels = { false, -1, -1 };
+  SafeLimit safelimit_16_pixels = { false, -1, -1 };
+  SafeLimit safelimit_32_pixels = { false, -1, -1 };
 
   ResamplingProgram(int filter_size, int source_size, int target_size, double crop_start, double crop_size, int bits_per_pixel, IScriptEnvironment* env)
     : Env(env), source_size(source_size), target_size(target_size), crop_start(crop_start), crop_size(crop_size), filter_size(filter_size), filter_size_real(filter_size),
     bits_per_pixel(bits_per_pixel), pixel_coefficient(0), pixel_coefficient_float(0)
   {
-    overread_possible = false;
-    source_overread_offset = -1;
-    source_overread_beyond_targetx = -1;
 
-    // align target_size to 8 units to allow safe 8 pixels/cycle in H resizers
+    
     filter_size_alignment = 1;
+    // align target_size to 8 units to allow safe up to 8 pixels/cycle in H resizers. modded later.
+    target_size_alignment = 1;
     // resize_prepare_coeff can override and realign the size of coefficient table
     if (bits_per_pixel < 32)
       pixel_coefficient = (short*)Env->Allocate(sizeof(short) * target_size * filter_size, 64, AVS_NORMAL_ALLOC);

From df75a2cfc13f14c5dc9c6bb6e4e3e041365702d4 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Thu, 22 May 2025 18:30:35 +0300
Subject: [PATCH 13/27] Added Pinterf update from 20.05.2025 and

new universal function of AVX2 float ks4 processing with auto-selection between 2 source loading methods
---
 avs_core/filters/intel/resample_avx2.cpp   | 673 +++++++++++++++++----
 avs_core/filters/intel/resample_avx2.h     |  85 +--
 avs_core/filters/intel/resample_avx512.cpp | 593 ++++++------------
 avs_core/filters/intel/resample_avx512.h   |  86 ++-
 avs_core/filters/intel/resample_sse.cpp    | 356 +++++++----
 avs_core/filters/intel/resample_sse.h      |   2 +
 6 files changed, 1121 insertions(+), 674 deletions(-)

diff --git a/avs_core/filters/intel/resample_avx2.cpp b/avs_core/filters/intel/resample_avx2.cpp
index 5e22a2f59..96fa1533a 100644
--- a/avs_core/filters/intel/resample_avx2.cpp
+++ b/avs_core/filters/intel/resample_avx2.cpp
@@ -365,7 +365,7 @@ static void internal_resizer_h_avx2_generic_uint8_16_t(BYTE* dst8, const BYTE* s
   dst_pitch /= sizeof(pixel_t);
   src_pitch /= sizeof(pixel_t);
 
-  const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8;
+  const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8;
 
   for (int y = 0; y < height; y++) {
     const short* AVS_RESTRICT current_coeff_base = program->pixel_coefficient;
@@ -586,7 +586,7 @@ static void internal_resizer_h_avx2_generic_float(BYTE* dst8, const BYTE* src8,
   dst_pitch = dst_pitch / sizeof(float);
   src_pitch = src_pitch / sizeof(float);
 
-  const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8;
+  const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8;
 
   for (int y = 0; y < height; y++) {
     float* current_coeff_base = program->pixel_coefficient_float;
@@ -627,7 +627,7 @@ void resizer_h_avx2_generic_float(BYTE* dst8, const BYTE* src8, int dst_pitch, i
 // end of H float
 
 //-------- 256 bit Verticals
-/*
+#if 0
 void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
 {
   AVS_UNUSED(bits_per_pixel);
@@ -702,7 +702,7 @@ void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int d
     current_coeff += filter_size;
   }
 }
-*/
+#else
 
 void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
 {
@@ -822,7 +822,7 @@ void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int d
         current_coeff += filter_size;
     }
 }
-
+#endif
 template<bool lessthan16bit>
 void resize_v_avx2_planar_uint16_t(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
 {
@@ -1110,146 +1110,287 @@ void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_p
 
 }
 
-// process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2
+
+// Safe dual-lane partial load for AVX.
+// Reads exactly Nmod4 floats (Nmod4 == 0 means all 4) from each pointer and zero-fills the rest, which
+// - avoids reading beyond the end of the source buffer,
+// - avoids NaN contamination, since even with zero coefficients NaN * 0 = NaN.
+template <int Nmod4>
+AVS_FORCEINLINE static __m256 _mm256_load_partial_safe_2_m128(const float* src_ptr_offsetted1, const float* src_ptr_offsetted2) {
+  __m128 s1;
+  __m128 s2;
+  switch (Nmod4) {
+  case 1:
+    s1 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted2[0]);
+    // ideally: movss
+    break;
+  case 2:
+    s1 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    // ideally: movsd
+    break;
+  case 3:
+    s1 = _mm_set_ps(0.0f, src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    // ideally: movss + movsd + shuffle or movsd + insert
+    break;
+  case 0:
+    s1 = _mm_set_ps(src_ptr_offsetted1[3], src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(src_ptr_offsetted2[3], src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    // ideally: movups
+    break;
+  default:
+    s1 = _mm_setzero_ps(); // n/a cannot happen
+    s2 = _mm_setzero_ps();
+  }
+  return _mm256_set_m128(s2, s1); // (hi, lo): s1 fills the low 128-bit lane, s2 the high lane
+}
+
+
+// Processes a horizontal resampling kernel of up to four coefficients for float pixel types.
+// Supports BilinearResize, BicubicResize, or sinc with up to 2 taps (filter size <= 4).
+// AVX optimization loads and processes four float coefficients and eight pixels simultaneously.
+// The 'filtersizemod4' template parameter (0-3) helps optimize for different filter sizes modulo 4.
+// This AVX2 requires only filter_size_alignment of 4.
+template<int filtersizemod4>
 void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
-    int filter_size = program->filter_size;
+  assert(filtersizemod4 >= 0 && filtersizemod4 <= 3);
 
-    const float* AVS_RESTRICT current_coeff;
+  const int filter_size = program->filter_size; // aligned, practically the coeff table stride
 
-    src_pitch = src_pitch / sizeof(float);
-    dst_pitch = dst_pitch / sizeof(float);
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(float);
 
-    float* src = (float*)src8;
-    float* dst = (float*)dst8;
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  constexpr int PIXELS_AT_A_TIME = 8; // Process eight pixels in parallel using AVX2 (2x4 using m128 lanes)
+
+  // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width.
+  // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back
+  // from the target width, as we load 4 floats at once with '_mm_loadu_ps'.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
 
-    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop.
 
-    for (int x = 0; x < width; x += 8) 
+  // 'target_size_alignment' ensures we can safely access coefficients using offsets like
+  // 'filter_size * 7' when processing 8 H pixels at a time or
+  // 'filter_size * 15' when processing 16 H pixels at a time
+  assert(program->target_size_alignment >= 8);
+
+  // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads.
+  assert(program->filter_size_alignment >= 4);
+
+  int x = 0;
+
+  // This 'auto' lambda construct replaces the need of templates
+  auto do_h_float_core = [&](auto partial_load) {
+    // Load up to 2x4 coefficients at once before the height loop.
+    // Pre-loading and transposing coefficients keeps register usage efficient.
+    // Assumes 'filter_size_aligned' is at least 4.
+
+    // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3] and for src_ptr + begin5 [0..3] )
+    __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+    __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+    __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+    __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+
+    _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src;
+
+    // Pixel offsets for the current target x-positions.
+    // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+    const int begin1 = program->pixel_offset[x + 0];
+    const int begin2 = program->pixel_offset[x + 1];
+    const int begin3 = program->pixel_offset[x + 2];
+    const int begin4 = program->pixel_offset[x + 3];
+    const int begin5 = program->pixel_offset[x + 4];
+    const int begin6 = program->pixel_offset[x + 5];
+    const int begin7 = program->pixel_offset[x + 6];
+    const int begin8 = program->pixel_offset[x + 7];
+
+    for (int y = 0; y < height; y++)
     {
-        __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
-        __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
-        __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
-        __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
-        
-        _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
-
-        float* AVS_RESTRICT dst_ptr = dst + x;
-        const float* src_ptr = src;
-
-        for (int y = 0; y < height; y++) 
-        {
-            __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0], src_ptr + program->pixel_offset[x + 4]);
-            __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1], src_ptr + program->pixel_offset[x + 5]);
-            __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2], src_ptr + program->pixel_offset[x + 6]);
-            __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3], src_ptr + program->pixel_offset[x + 7]);
-
-            _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
-
-            __m256 result = _mm256_mul_ps(data_1_data_5, coef_1_coef_5);
-            result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
-            result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
-            result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
-
-            _mm256_store_ps(dst_ptr, result);
-
-            dst_ptr += dst_pitch;
-            src_ptr += src_pitch;
-        }
-        current_coeff += filter_size * 8;
-    }
+      __m256 data_1_data_5;
+      __m256 data_2_data_6;
+      __m256 data_3_data_7;
+      __m256 data_4_data_8;
+
+      if constexpr (partial_load) {
+        // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+        // to prevent reading beyond the allocated source scanline. This handles cases where loading 4 floats
+        // starting from 'src_ptr + beginX' might exceed the source buffer.
+
+        // Example of the unsafe scenario: If target width is 320, a naive load at src_ptr + 317
+        // would attempt to read floats at indices 317, 318, 319, and 320, potentially going out of bounds.
+
+        // Two main issues in the unsafe zone:
+        // 1.) Out-of-bounds memory access: Reading beyond the allocated memory for the source scanline can
+        //     lead to access violations and crashes. '_mm_loadu_ps' attempts to load 16 bytes, so even if
+        //     the starting address is within bounds, subsequent reads might not be.
+        // 2.) Garbage or NaN values: Even if a read doesn't cause a crash, accessing uninitialized or
+        //     out-of-bounds memory (especially for float types) can result in garbage data, including NaN.
+        //     Multiplying by a valid coefficient and accumulating this NaN can contaminate the final result.
+
+        // '_mm256_load_partial_safe_2_m128' safely loads up to 'filter_size_real' pixels and pads with zeros if needed,
+        // preventing out-of-bounds reads and ensuring predictable results even near the image edges.
+
+        data_1_data_5 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5);
+        data_2_data_6 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6);
+        data_3_data_7 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7);
+        data_4_data_8 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8);
+      }
+      else {
+        // In the safe zone, we can directly load 4 pixels at a time using unaligned loads.
+        data_1_data_5 = _mm256_loadu_2_m128(src_ptr + begin1, src_ptr + begin5);
+        data_2_data_6 = _mm256_loadu_2_m128(src_ptr + begin2, src_ptr + begin6);
+        data_3_data_7 = _mm256_loadu_2_m128(src_ptr + begin3, src_ptr + begin7);
+        data_4_data_8 = _mm256_loadu_2_m128(src_ptr + begin4, src_ptr + begin8);
+      }
 
+      _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
+
+      __m256 result = _mm256_mul_ps(data_1_data_5, coef_1_coef_5);
+      result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+      result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+      result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+
+      _mm256_store_ps(dst_ptr, result);
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    } // y
+    current_coeff += filter_size * 8; // Move to the next set of coefficients for the next 8 output pixels
+    }; // end of lambda
+
+  // Process the 'safe zone' where direct full unaligned loads are acceptable.
+  for (; x < width_safe_mod; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm_loadu_ps
+  }
+
+  // Process the potentially 'unsafe zone' near the image edge, using safe loading.
+  for (; x < width; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm256_load_partial_safe_2_m128'
+  }
 }
 
+// Instantiate them
+template void resize_h_planar_float_avx_transpose_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx_transpose_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx_transpose_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx_transpose_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
 void resize_h_planar_float_avx_gather_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
 {
-    int filter_size = program->filter_size;
+  int filter_size = program->filter_size;
 
-    const float* AVS_RESTRICT current_coeff;
+  const float* AVS_RESTRICT current_coeff;
 
-    src_pitch = src_pitch / sizeof(float);
-    dst_pitch = dst_pitch / sizeof(float);
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
 
-    float* src = (float*)src8;
-    float* dst = (float*)dst8;
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
 
-    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
-    __m256i one_epi32 = _mm256_set1_epi32(1);
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+  __m256i one_epi32 = _mm256_set1_epi32(1);
 
-    for (int x = 0; x < width; x += 8)
-    {
-        __m256 coef_0 = _mm256_load_ps(current_coeff + filter_size * 0);
-        __m256 coef_1 = _mm256_load_ps(current_coeff + filter_size * 1);
-        __m256 coef_2 = _mm256_load_ps(current_coeff + filter_size * 2);
-        __m256 coef_3 = _mm256_load_ps(current_coeff + filter_size * 3);
-        __m256 coef_4 = _mm256_load_ps(current_coeff + filter_size * 4);
-        __m256 coef_5 = _mm256_load_ps(current_coeff + filter_size * 5);
-        __m256 coef_6 = _mm256_load_ps(current_coeff + filter_size * 6);
-        __m256 coef_7 = _mm256_load_ps(current_coeff + filter_size * 7);
+  for (int x = 0; x < width; x += 8)
+  {
+    __m256 coef_0 = _mm256_load_ps(current_coeff + filter_size * 0);
+    __m256 coef_1 = _mm256_load_ps(current_coeff + filter_size * 1);
+    __m256 coef_2 = _mm256_load_ps(current_coeff + filter_size * 2);
+    __m256 coef_3 = _mm256_load_ps(current_coeff + filter_size * 3);
+    __m256 coef_4 = _mm256_load_ps(current_coeff + filter_size * 4);
+    __m256 coef_5 = _mm256_load_ps(current_coeff + filter_size * 5);
+    __m256 coef_6 = _mm256_load_ps(current_coeff + filter_size * 6);
+    __m256 coef_7 = _mm256_load_ps(current_coeff + filter_size * 7);
 
-        _MM_TRANSPOSE8_PS(coef_0, coef_1, coef_2, coef_3, coef_4, coef_5, coef_6, coef_7);
+    _MM_TRANSPOSE8_PS(coef_0, coef_1, coef_2, coef_3, coef_4, coef_5, coef_6, coef_7);
 
-        float* AVS_RESTRICT dst_ptr = dst + x;
-        const float* src_ptr = src;
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src;
 
-        for (int y = 0; y < height; y++)
-        {
-//            __m256i offsets = _mm256_load_si256(program->pixel_offset + x); // hope it is always aligned ?
-            __m256i offsets = _mm256_set_epi32(program->pixel_offset[x + 7], program->pixel_offset[x + 6], program->pixel_offset[x + 5], program->pixel_offset[x + 4], program->pixel_offset[x + 3], program->pixel_offset[x + 2], program->pixel_offset[x + 1], program->pixel_offset[x + 0]);
-//            __m256i offsets = _mm256_set1_epi32(program->pixel_offset[x]); // test
-            __m256 data_0 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+    for (int y = 0; y < height; y++)
+    {
+      //            __m256i offsets = _mm256_load_si256(program->pixel_offset + x); // hope it is always aligned ?
+      __m256i offsets = _mm256_set_epi32(program->pixel_offset[x + 7], program->pixel_offset[x + 6], program->pixel_offset[x + 5], program->pixel_offset[x + 4], program->pixel_offset[x + 3], program->pixel_offset[x + 2], program->pixel_offset[x + 1], program->pixel_offset[x + 0]);
+      //            __m256i offsets = _mm256_set1_epi32(program->pixel_offset[x]); // test
+      __m256 data_0 = _mm256_i32gather_ps(src_ptr, offsets, 4);
 
-            offsets = _mm256_add_epi32(offsets, one_epi32);
-            __m256 data_1 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_1 = _mm256_i32gather_ps(src_ptr, offsets, 4);
 
-            offsets = _mm256_add_epi32(offsets, one_epi32);
-            __m256 data_2 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_2 = _mm256_i32gather_ps(src_ptr, offsets, 4);
 
-            offsets = _mm256_add_epi32(offsets, one_epi32);
-            __m256 data_3 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_3 = _mm256_i32gather_ps(src_ptr, offsets, 4);
 
-            offsets = _mm256_add_epi32(offsets, one_epi32);
-            __m256 data_4 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_4 = _mm256_i32gather_ps(src_ptr, offsets, 4);
 
-            offsets = _mm256_add_epi32(offsets, one_epi32);
-            __m256 data_5 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_5 = _mm256_i32gather_ps(src_ptr, offsets, 4);
 
-            offsets = _mm256_add_epi32(offsets, one_epi32);
-            __m256 data_6 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_6 = _mm256_i32gather_ps(src_ptr, offsets, 4);
 
-            offsets = _mm256_add_epi32(offsets, one_epi32);
-            __m256 data_7 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_7 = _mm256_i32gather_ps(src_ptr, offsets, 4);
 
-            __m256 result0 = _mm256_mul_ps(data_0, coef_0);
-            __m256 result1 = _mm256_mul_ps(data_4, coef_4);
-            
-            result0 = _mm256_fmadd_ps(data_1, coef_1, result0);
-            result1 = _mm256_fmadd_ps(data_5, coef_5, result1);
+      __m256 result0 = _mm256_mul_ps(data_0, coef_0);
+      __m256 result1 = _mm256_mul_ps(data_4, coef_4);
 
-            result0 = _mm256_fmadd_ps(data_2, coef_2, result0);
-            result1 = _mm256_fmadd_ps(data_6, coef_6, result1);
+      result0 = _mm256_fmadd_ps(data_1, coef_1, result0);
+      result1 = _mm256_fmadd_ps(data_5, coef_5, result1);
 
-            result0 = _mm256_fmadd_ps(data_3, coef_3, result0);
-            result1 = _mm256_fmadd_ps(data_7, coef_7, result1);
+      result0 = _mm256_fmadd_ps(data_2, coef_2, result0);
+      result1 = _mm256_fmadd_ps(data_6, coef_6, result1);
 
-            _mm256_store_ps(dst_ptr, _mm256_add_ps(result0, result1));
+      result0 = _mm256_fmadd_ps(data_3, coef_3, result0);
+      result1 = _mm256_fmadd_ps(data_7, coef_7, result1);
 
-            dst_ptr += dst_pitch;
-            src_ptr += src_pitch;
-        }
-        current_coeff += filter_size * 8;
+      _mm256_store_ps(dst_ptr, _mm256_add_ps(result0, result1));
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
     }
+    current_coeff += filter_size * 8;
+  }
 }
 
-void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
-{
+void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
 
   // assert - check if max pixel_offset is not above single load of 8 src floats (or need several loads and more complex permute program)
-#ifdef _DEBUG
+  // probably this is a valid assumption; there can be no jumps in source pixel indexes, it would mean that the
+  // filter would neglect some pixels in the source image, which is not allowed by the filter design
+#if 1 //def _DEBUG
   for (int x = 0; x < width; x += 8)
   {
     int start_off = program->pixel_offset[x + 0];
     int end_off = program->pixel_offset[x + 7];
-    assert((end_off - start_off) > 7);
+    assert((end_off - start_off) <= 7);
+
+    start_off = program->pixel_offset[x + 1];
+    end_off = program->pixel_offset[x + 7 + 1];
+    assert((end_off - start_off) <= 7);
+
+    start_off = program->pixel_offset[x + 2];
+    end_off = program->pixel_offset[x + 7 + 2];
+    assert((end_off - start_off) <= 7);
+
+    start_off = program->pixel_offset[x + 3];
+    end_off = program->pixel_offset[x + 7 + 3];
+    assert((end_off - start_off) <= 7);
   }
 #endif
 
@@ -1278,10 +1419,38 @@ void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src
 
     // convert resampling program in H-form into permuting indexes for src transposition in V-form
     int iStart = program->pixel_offset[x + 0];
-    __m256i perm_0 = _mm256_set_epi32(program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart,  0);
+
+    __m256i perm_0 = _mm256_set_epi32(
+      program->pixel_offset[x + 7] - iStart,
+      program->pixel_offset[x + 6] - iStart,
+      program->pixel_offset[x + 5] - iStart,
+      program->pixel_offset[x + 4] - iStart,
+      program->pixel_offset[x + 3] - iStart,
+      program->pixel_offset[x + 2] - iStart,
+      program->pixel_offset[x + 1] - iStart,
+      0);
+    __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
+    one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+    __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
+    one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+    __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
+    /*
+    one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 1] - program->pixel_offset[x + 0]);
     __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
+    one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
     __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
+    one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
     __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
+    __m256i perm_1 = _mm256_set_epi32(
+      program->pixel_offset[x + 7] - iStart,
+      program->pixel_offset[x + 6] - iStart,
+      program->pixel_offset[x + 5] - iStart,
+      program->pixel_offset[x + 4] - iStart,
+      program->pixel_offset[x + 3] - iStart,
+      program->pixel_offset[x + 2] - iStart,
+      program->pixel_offset[x + 1] - iStart,
+      0);
+    */
 
     float* AVS_RESTRICT dst_ptr = dst + x;
     const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
@@ -1310,6 +1479,247 @@ void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src
   }
 }
 
+
+/* Universal function supporting two processing paths, selected by the maximum source-sample offset span found in the resampling program:
+1. For high upsampling ratios - a single load of 8 source floats, followed by permute-transpose before the V-fma
+2. For downsampling and no-resize convolution - each input sequence is gathered by direct addressing
+*/
+template<int filtersizemod4>
+void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  assert(filtersizemod4 >= 0 && filtersizemod4 <= 3);
+
+  const int filter_size = program->filter_size; // aligned, practically the coeff table stride
+
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  constexpr int PIXELS_AT_A_TIME = 8; // Process eight pixels in parallel using AVX2 (2x4 using m128 lanes)
+
+  // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width.
+  // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back
+  // from the target width, as we load 4 floats at once with '_mm_loadu_ps'.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
+
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop.
+
+  // 'target_size_alignment' ensures we can safely access coefficients using offsets like
+  // 'filter_size * 7' when processing 8 H pixels at a time or
+  // 'filter_size * 15' when processing 16 H pixels at a time
+  assert(program->target_size_alignment >= 8);
+
+  // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads.
+  assert(program->filter_size_alignment >= 4);
+
+  bool bDoGather = false;
+  // Analyse input resampling program to select method of processing
+  for (int x = 0; x < width - 8; x += 8) // -8 to guard against a vector overread at program->pixel_offset[x + 7 + 3]; ?
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 7];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 1];
+    end_off = program->pixel_offset[x + 7 + 1];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 2];
+    end_off = program->pixel_offset[x + 7 + 2];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 3];
+    end_off = program->pixel_offset[x + 7 + 3];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true; 
+  }
+
+  if (bDoGather)
+  {
+    int x = 0;
+
+    // This 'auto' lambda construct replaces the need of templates
+    auto do_h_float_core = [&](auto partial_load) {
+      // Load up to 2x4 coefficients at once before the height loop.
+      // Pre-loading and transposing coefficients keeps register usage efficient.
+      // Assumes 'filter_size_alignment' is at least 4.
+
+      // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3] and for src_ptr + begin5 [0..3] )
+      __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+      __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+      __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+      __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+
+      _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src;
+
+      // Pixel offsets for the current target x-positions.
+      // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+      const int begin1 = program->pixel_offset[x + 0];
+      const int begin2 = program->pixel_offset[x + 1];
+      const int begin3 = program->pixel_offset[x + 2];
+      const int begin4 = program->pixel_offset[x + 3];
+      const int begin5 = program->pixel_offset[x + 4];
+      const int begin6 = program->pixel_offset[x + 5];
+      const int begin7 = program->pixel_offset[x + 6];
+      const int begin8 = program->pixel_offset[x + 7];
+
+      for (int y = 0; y < height; y++)
+      {
+        __m256 data_1_data_5;
+        __m256 data_2_data_6;
+        __m256 data_3_data_7;
+        __m256 data_4_data_8;
+
+        if constexpr (partial_load) {
+          // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+          // to prevent reading beyond the allocated source scanline. This handles cases where loading 4 floats
+          // starting from 'src_ptr + beginX' might exceed the source buffer.
+
+          // Example of the unsafe scenario: If target width is 320, a naive load at src_ptr + 317
+          // would attempt to read floats at indices 317, 318, 319, and 320, potentially going out of bounds.
+
+          // Two main issues in the unsafe zone:
+          // 1.) Out-of-bounds memory access: Reading beyond the allocated memory for the source scanline can
+          //     lead to access violations and crashes. '_mm_loadu_ps' attempts to load 16 bytes, so even if
+          //     the starting address is within bounds, subsequent reads might not be.
+          // 2.) Garbage or NaN values: Even if a read doesn't cause a crash, accessing uninitialized or
+          //     out-of-bounds memory (especially for float types) can result in garbage data, including NaN.
+          //     Multiplying by a valid coefficient and accumulating this NaN can contaminate the final result.
+
+          // '_mm256_load_partial_safe_2_m128' safely loads up to 'filter_size_real' pixels and pads with zeros if needed,
+          // preventing out-of-bounds reads and ensuring predictable results even near the image edges.
+
+          data_1_data_5 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5);
+          data_2_data_6 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6);
+          data_3_data_7 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7);
+          data_4_data_8 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8);
+        }
+        else {
+          // In the safe zone, we can directly load 4 pixels at a time using unaligned loads.
+          data_1_data_5 = _mm256_loadu_2_m128(src_ptr + begin1, src_ptr + begin5);
+          data_2_data_6 = _mm256_loadu_2_m128(src_ptr + begin2, src_ptr + begin6);
+          data_3_data_7 = _mm256_loadu_2_m128(src_ptr + begin3, src_ptr + begin7);
+          data_4_data_8 = _mm256_loadu_2_m128(src_ptr + begin4, src_ptr + begin8);
+        }
+
+        _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
+
+        __m256 result = _mm256_mul_ps(data_1_data_5, coef_1_coef_5);
+        result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+        result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+        result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+
+        _mm256_store_ps(dst_ptr, result);
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      } // y
+      current_coeff += filter_size * 8; // Move to the next set of coefficients for the next 8 output pixels
+    }; // end of lambda
+
+  // Process the 'safe zone' where direct full unaligned loads are acceptable.
+    for (; x < width_safe_mod; x += PIXELS_AT_A_TIME)
+    {
+      do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm_loadu_ps
+    }
+
+    // Process the potentially 'unsafe zone' near the image edge, using safe loading.
+    for (; x < width; x += PIXELS_AT_A_TIME)
+    {
+      do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm256_load_partial_safe_2_m128'
+    }
+  } // if bDoGather
+  else
+  {
+    // do permutex-based upsample
+    for (int x = 0; x < width; x += 8)
+    {
+      // prepare coefs in transposed V-form
+      __m256 coef_0 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+      __m256 coef_1 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+      __m256 coef_2 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+      __m256 coef_3 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+
+      _MM_TRANSPOSE8_LANE4_PS(coef_0, coef_1, coef_2, coef_3);
+
+      // convert resampling program in H-form into permuting indexes for src transposition in V-form
+      int iStart = program->pixel_offset[x + 0];
+
+      __m256i perm_0 = _mm256_set_epi32(
+        program->pixel_offset[x + 7] - iStart,
+        program->pixel_offset[x + 6] - iStart,
+        program->pixel_offset[x + 5] - iStart,
+        program->pixel_offset[x + 4] - iStart,
+        program->pixel_offset[x + 3] - iStart,
+        program->pixel_offset[x + 2] - iStart,
+        program->pixel_offset[x + 1] - iStart,
+        0);
+      __m256i one_epi32 = _mm256_set1_epi32(1);
+      __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
+      one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+      __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
+      one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+      __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
+      /*
+      one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 1] - program->pixel_offset[x + 0]);
+      __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
+      one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+      __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
+      one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+      __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
+      __m256i perm_1 = _mm256_set_epi32(
+        program->pixel_offset[x + 7] - iStart,
+        program->pixel_offset[x + 6] - iStart,
+        program->pixel_offset[x + 5] - iStart,
+        program->pixel_offset[x + 4] - iStart,
+        program->pixel_offset[x + 3] - iStart,
+        program->pixel_offset[x + 2] - iStart,
+        program->pixel_offset[x + 1] - iStart,
+        0);
+      */
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+      for (int y = 0; y < height; y++)
+      {
+        __m256 data_src = _mm256_loadu_ps(src_ptr);
+
+        __m256 data_0 = _mm256_permutevar8x32_ps(data_src, perm_0);
+        __m256 data_1 = _mm256_permutevar8x32_ps(data_src, perm_1);
+        __m256 data_2 = _mm256_permutevar8x32_ps(data_src, perm_2);
+        __m256 data_3 = _mm256_permutevar8x32_ps(data_src, perm_3);
+
+        __m256 result0 = _mm256_mul_ps(data_0, coef_0);
+        __m256 result1 = _mm256_mul_ps(data_2, coef_2);
+
+        result0 = _mm256_fmadd_ps(data_1, coef_1, result0);
+        result1 = _mm256_fmadd_ps(data_3, coef_3, result1);
+
+        _mm256_store_ps(dst_ptr, _mm256_add_ps(result0, result1));
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      }
+      current_coeff += filter_size * 8;
+    }
+  }
+}
+
+// Instantiate them
+template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+
 void resize_h_planar_float_avx2_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
 {
 
@@ -1406,3 +1816,56 @@ void resize_h_planar_float_avx2_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src
     current_coeff += filter_size * 8;
   }
 }
+
+#if 0
+// Original DTL2020, made end-condition safe
+// processes kernel sizes up to 4 - BilinearResize, BicubicResize or sinc up to taps=2
+void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+  int filter_size = program->filter_size;
+
+  const float* AVS_RESTRICT current_coeff;
+
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 8)
+  {
+    __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+    __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+    __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+    __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+
+    _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src;
+
+    for (int y = 0; y < height; y++)
+    {
+      __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0], src_ptr + program->pixel_offset[x + 4]);
+      __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1], src_ptr + program->pixel_offset[x + 5]);
+      __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2], src_ptr + program->pixel_offset[x + 6]);
+      __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3], src_ptr + program->pixel_offset[x + 7]);
+
+      _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
+
+      __m256 result = _mm256_mul_ps(data_1_data_5, coef_1_coef_5);
+      result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+      result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+      result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+
+      _mm256_store_ps(dst_ptr, result);
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+    current_coeff += filter_size * 8;
+  }
+
+}
+#endif
diff --git a/avs_core/filters/intel/resample_avx2.h b/avs_core/filters/intel/resample_avx2.h
index e57e24a56..d5b2baa5d 100644
--- a/avs_core/filters/intel/resample_avx2.h
+++ b/avs_core/filters/intel/resample_avx2.h
@@ -54,56 +54,57 @@ void resize_v_avx2_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int
 
 void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
+template<int filtersizemod4>
 void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
-void resize_h_planar_float_avx_gather_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
-
 void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
-void resize_h_planar_float_avx2_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+template<int filtersizemod4>
+void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
 // Transpose 4x4 blocks within each lane
 #define _MM_TRANSPOSE8_LANE4_PS(row0, row1, row2, row3) \
-	do { \
-		__m256 __t0, __t1, __t2, __t3; \
-		__t0 = _mm256_unpacklo_ps(row0, row1); \
-		__t1 = _mm256_unpackhi_ps(row0, row1); \
-		__t2 = _mm256_unpacklo_ps(row2, row3); \
-		__t3 = _mm256_unpackhi_ps(row2, row3); \
-		row0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
-		row1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
-		row2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
-		row3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
-	} while (0)
+  do { \
+    __m256 __t0, __t1, __t2, __t3; \
+    __t0 = _mm256_unpacklo_ps(row0, row1); \
+    __t1 = _mm256_unpackhi_ps(row0, row1); \
+    __t2 = _mm256_unpacklo_ps(row2, row3); \
+    __t3 = _mm256_unpackhi_ps(row2, row3); \
+    row0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
+    row1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
+    row2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
+    row3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
+  } while (0)
 
 #define _MM_TRANSPOSE8_PS(row0, row1, row2, row3, row4, row5, row6, row7) \
-	do { \
-		__m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; \
-		__m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; \
-		__t0 = _mm256_unpacklo_ps(row0, row1); \
-		__t1 = _mm256_unpackhi_ps(row0, row1); \
-		__t2 = _mm256_unpacklo_ps(row2, row3); \
-		__t3 = _mm256_unpackhi_ps(row2, row3); \
-		__t4 = _mm256_unpacklo_ps(row4, row5); \
-		__t5 = _mm256_unpackhi_ps(row4, row5); \
-		__t6 = _mm256_unpacklo_ps(row6, row7); \
-		__t7 = _mm256_unpackhi_ps(row6, row7); \
-		__tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
-		__tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
-		__tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
-		__tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
-		__tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); \
-		__tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); \
-		__tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); \
-		__tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); \
-		row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); \
-		row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); \
-		row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); \
-		row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); \
-		row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); \
-		row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); \
-		row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); \
-		row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); \
-	} while (0)
+  do { \
+    __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; \
+    __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; \
+    __t0 = _mm256_unpacklo_ps(row0, row1); \
+    __t1 = _mm256_unpackhi_ps(row0, row1); \
+    __t2 = _mm256_unpacklo_ps(row2, row3); \
+    __t3 = _mm256_unpackhi_ps(row2, row3); \
+    __t4 = _mm256_unpacklo_ps(row4, row5); \
+    __t5 = _mm256_unpackhi_ps(row4, row5); \
+    __t6 = _mm256_unpacklo_ps(row6, row7); \
+    __t7 = _mm256_unpackhi_ps(row6, row7); \
+    __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
+    __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
+    __tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
+    __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
+    __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); \
+    __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); \
+    __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); \
+    __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); \
+    row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); \
+    row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); \
+    row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); \
+    row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); \
+    row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); \
+    row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); \
+    row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); \
+    row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); \
+  } while (0)
 
 
 
diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp
index d9b1bd90f..a711bc97a 100644
--- a/avs_core/filters/intel/resample_avx512.cpp
+++ b/avs_core/filters/intel/resample_avx512.cpp
@@ -32,41 +32,202 @@
 // which is not derived from or based on Avisynth, such as 3rd-party filters,
 // import and export plugins, or graphical user interfaces.
 
-//#include "resample_sse.h"
 #include <avs/config.h>
 #include "../core/internal.h"
 
 #include <avs/alignment.h>
 #include <avs/minmax.h>
 
-// experimental simd includes for avx2 compiled files
-#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
-#include <x86intrin.h>
-// x86intrin.h includes header files for whatever instruction
-// sets are specified on the compiler command line, such as: xopintrin.h, fma4intrin.h
-#else
-#include <immintrin.h> // MS version of immintrin.h covers AVX, AVX2 and FMA3
-#endif // __GNUC__
-
-#if !defined(__FMA__)
-// Assume that all processors that have AVX2 also have FMA3
-#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (__clang__)
-// Prevent error message in g++ when using FMA intrinsics with avx2:
-#pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
-#else
-#define __FMA__  1
-#endif
-#endif
-// FMA3 instruction set
-#if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__))  && ! defined (__INTEL_COMPILER)
-#include <fmaintrin.h>
-#endif // __FMA__
+#include "resample_avx512.h"
+//------- 512 bit float Horizontals
 
+// Safe partial load of four 128-bit lanes into one __m512 (one lane per source pointer).
+// Nmod4 is the element count modulo 4: 1..3 read exactly that many floats per pointer, 0 reads all four.
+// - Floats past that count are never dereferenced, so there is no read beyond the end of the source buffer.
+// - Unread slots are filled with 0.0f, which avoids NaN contamination from stray memory contents.
+template <int Nmod4>
+AVS_FORCEINLINE static __m512 _mm512_load_partial_safe_4_m128(const float* src_ptr_offsetted1, const float* src_ptr_offsetted2, const float* src_ptr_offsetted3, const float* src_ptr_offsetted4) {
+  __m128 s1, s2, s3, s4;
+  switch (Nmod4) { // switch on a template constant: exactly one case applies per instantiation
+  case 1:
+    s1 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted2[0]);
+    s3 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted3[0]);
+    s4 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted4[0]);
+    // _mm_set_ps lists elements from highest to lowest, so [0] lands in element 0. Ideally: movss
+    break;
+  case 2:
+    s1 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    s3 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted3[1], src_ptr_offsetted3[0]);
+    s4 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted4[1], src_ptr_offsetted4[0]);
+    // two valid floats, upper two elements zero. Ideally: movsd
+    break;
+  case 3:
+    s1 = _mm_set_ps(0.0f, src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    s3 = _mm_set_ps(0.0f, src_ptr_offsetted3[2], src_ptr_offsetted3[1], src_ptr_offsetted3[0]);
+    s4 = _mm_set_ps(0.0f, src_ptr_offsetted4[2], src_ptr_offsetted4[1], src_ptr_offsetted4[0]);
+    // three valid floats, top element zero. Ideally: movss + movsd + shuffle or movsd + insert
+    break;
+  case 0:
+    s1 = _mm_set_ps(src_ptr_offsetted1[3], src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(src_ptr_offsetted2[3], src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    s3 = _mm_set_ps(src_ptr_offsetted3[3], src_ptr_offsetted3[2], src_ptr_offsetted3[1], src_ptr_offsetted3[0]);
+    s4 = _mm_set_ps(src_ptr_offsetted4[3], src_ptr_offsetted4[2], src_ptr_offsetted4[1], src_ptr_offsetted4[0]);
+    // remainder 0 is treated as a full group of four floats, no zero padding. Ideally: movups
+    break;
+  default:
+    s1 = _mm_setzero_ps(); // not expected to happen; zero-fill keeps s1..s4 initialized on every path
+    s2 = _mm_setzero_ps();
+    s3 = _mm_setzero_ps();
+    s4 = _mm_setzero_ps();
+  }
+  __m512 result = _mm512_castps128_ps512(s1); // s1 becomes lane 0; the upper lanes are undefined after the cast
+  result = _mm512_insertf32x4(result, s2, 1); // s2 -> lane 1 (elements 4..7)
+  result = _mm512_insertf32x4(result, s3, 2); // s3 -> lane 2 (elements 8..11)
+  result = _mm512_insertf32x4(result, s4, 3); // s4 -> lane 3 (elements 12..15); every lane is now defined
+  return result;
+}
 
-#include "resample_avx512.h"
 
-//------- 512 bit float Horizontals
 
+
+// Horizontal resampler for 32-bit float planes with a kernel of at most four coefficients.
+// Covers BilinearResize, BicubicResize, or sinc with up to 2 taps (filter_size_real <= 4).
+// Works in vertical stripes: each pass computes 16 adjacent output pixels for all 'height' rows,
+// reusing coefficients loaded and transposed once per stripe. 'filtersizemod4' (0-3) selects how many
+// floats the edge-safe loader reads per pixel. Needs a filter_size_alignment of only 4 (asserted below).
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+  assert(filtersizemod4 >= 0 && filtersizemod4 <= 3); // a modulo-4 remainder, so only 0..3 is valid
+
+  const int filter_size = program->filter_size; // aligned size; used below as the stride between per-pixel coefficient rows
+
+  src_pitch /= sizeof(float); // byte pitch -> pitch in floats
+  dst_pitch /= sizeof(float); // byte pitch -> pitch in floats
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  constexpr int PIXELS_AT_A_TIME = 16; // One stripe = sixteen output pixels: four 128-bit lanes of four floats each
+
+  // Columns below 'width_safe_mod' (a multiple of 16) are handled with plain unaligned 4-float loads.
+  // When the program reports that over-reading is possible, the boundary is taken from
+  // safelimit_4_pixels.source_overread_beyond_targetx; otherwise the whole width counts as safe.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
+
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // All relevant coefficients (up to 4 per pixel) are preloaded before the height loop.
+
+  // 'target_size_alignment' is what makes it safe to address coefficient rows and pixel offsets
+  // past the last real output pixel, e.g. 'filter_size * 15' and 'pixel_offset[x + 15]'
+  // when a stripe of 16 pixels extends beyond 'width'.
+  assert(program->target_size_alignment >= 16); // a full stripe of 16 must be addressable
+  assert(FRAME_ALIGN >= 64); // rows must be 64-byte aligned: the result is written with an aligned 16-float store
+
+  // Each coefficient row is loaded as four floats, so rows must be padded to at least 4.
+  assert(program->filter_size_alignment >= 4);
+
+  int x = 0; // first output column of the current stripe; captured by reference in the lambda
+
+  // Generic lambda: 'partial_load' is std::true_type or std::false_type, so the load path is picked at compile time.
+  auto do_h_float_core = [&](auto partial_load) {
+    // Load the 16 x 4 coefficients of this stripe once, before the height loop.
+    // Pre-loading and transposing coefficients keeps them out of the per-row work.
+    // Relies on 'filter_size_alignment' being at least 4 (asserted above).
+
+    // Before the transpose each register holds the 4 coefficients of four pixels (one pixel per 128-bit lane): 1,5,9,13 etc.
+    __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+    __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+    __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+    __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+    _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16); // presumably a 4x4 transpose inside each 128-bit lane (macro defined elsewhere): register k then holds tap k of all 16 pixels
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src;
+
+    // Source start offsets of the 16 output pixels in this stripe (constant for all rows).
+    // Indexes up to x + 15 may lie beyond width - 1; the table is expected to be padded per 'target_size_alignment'.
+    const int begin1 = program->pixel_offset[x + 0];
+    const int begin2 = program->pixel_offset[x + 1];
+    const int begin3 = program->pixel_offset[x + 2];
+    const int begin4 = program->pixel_offset[x + 3];
+    const int begin5 = program->pixel_offset[x + 4];
+    const int begin6 = program->pixel_offset[x + 5];
+    const int begin7 = program->pixel_offset[x + 6];
+    const int begin8 = program->pixel_offset[x + 7];
+    const int begin9 = program->pixel_offset[x + 8];
+    const int begin10 = program->pixel_offset[x + 9];
+    const int begin11 = program->pixel_offset[x + 10];
+    const int begin12 = program->pixel_offset[x + 11];
+    const int begin13 = program->pixel_offset[x + 12];
+    const int begin14 = program->pixel_offset[x + 13];
+    const int begin15 = program->pixel_offset[x + 14];
+    const int begin16 = program->pixel_offset[x + 15];
+
+    for (int y = 0; y < height; y++)
+    {
+      __m512 data_1_5_9_13;
+      __m512 data_2_6_10_14;
+      __m512 data_3_7_11_15;
+      __m512 data_4_8_12_16;
+
+      if constexpr (partial_load) {
+        // Potentially unsafe zone (near the right edge of the image): read only 'filtersizemod4' floats
+        // per pixel (all four when it is 0) and zero the rest, so nothing beyond the source scanline is touched.
+
+        data_1_5_9_13 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+        data_2_6_10_14 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+        data_3_7_11_15 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+        data_4_8_12_16 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+      }
+      else {
+        // Safe zone: load 4 source floats per output pixel directly, one pixel per 128-bit lane.
+        data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+        data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+        data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+        data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+      }
+
+      _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16); // same rearrangement as applied to the coefficients, so the registers pair up tap by tap
+
+      __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13); // first product starts the sum
+      result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result); // result += data * coef
+      result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
+      result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
+
+      _mm512_store_ps(dst_ptr, result); // aligned store of all 16 floats, also when x + 16 > width; needs a 64-byte aligned dst_ptr
+
+      dst_ptr += dst_pitch; // next destination row
+      src_ptr += src_pitch; // next source row
+    } // y
+    current_coeff += filter_size * 16; // Advance to the coefficient rows of the next 16 output pixels
+    }; // end of lambda
+
+  // Process the 'safe zone', where full unaligned 4-float loads are acceptable.
+  for (; x < width_safe_mod; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128
+  }
+
+  // Process the remaining, potentially 'unsafe zone' near the right image edge with safe loads.
+  for (; x < width; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm512_load_partial_safe_4_m128'
+  }
+}
+
+// Explicit instantiations, one per possible filter-size remainder (filtersizemod4 = 0, 1, 2, 3).
+template void resize_h_planar_float_avx512_transpose_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_transpose_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_transpose_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_transpose_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+#if 0 // DTL version
 // Transpose-based
 // process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2
 void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
@@ -82,6 +243,7 @@ void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE*
 
     current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
 
+    // this 16xfloat works, since AviSynth aligns scanlines to 64 bytes.
     for (int x = 0; x < width; x += 16) // is it safe to read by 16 floats = 64 bytes ?
     {
         __m512 c1_c5_c9_c13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
@@ -117,26 +279,27 @@ void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE*
     }
 
 }
+#endif
 
-
+#if 0
 void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
 {
 
   // assert - check if max pixel_offset is not above single load of 16 src floats (or need several loads and more complex permute program)
 
 #ifdef _DEBUG
-  for (int x = 0; x < width; x++) // check each pair ?
+  for (int x = 0; x < width; x += 16)
   {
     int start_off = program->pixel_offset[x + 0];
     int end_off = program->pixel_offset[x + 15];
-    assert((end_off - start_off) < 15);
+    assert((end_off - start_off) > 15);
   }
 #endif
 
   int filter_size = program->filter_size;
 
   const float* AVS_RESTRICT current_coeff;
-  __m512i one_epi32 = _mm512_set1_epi32(1); 
+  __m512i one_epi32 = _mm512_set1_epi32(1);
 
   src_pitch = src_pitch / sizeof(float);
   dst_pitch = dst_pitch / sizeof(float);
@@ -146,7 +309,6 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* s
 
   current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
 
-/* // 16 output samples in H-direction per vstripe
   for (int x = 0; x < width; x += 16)
   {
     // prepare coefs in transposed V-form
@@ -247,371 +409,8 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* s
 
     current_coeff += filter_size * 16;
   }
-*/ // 16 output samples per vstripe
-
-/*
-  // 32 output samples per vstripe
-  for (int x = 0; x < width; x += 32) // processing by 32 sample - it is safe at the end of row ?
-  {
-    // prepare coefs in transposed V-form
-    __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
-    __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
-    __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
-    __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
-
-    // prepare coefs in transposed V-form
-    __m512 coef_r0_2 = _mm512_load_4_m128(current_coeff + filter_size * 16, current_coeff + filter_size * 20, current_coeff + filter_size * 24, current_coeff + filter_size * 28);
-    __m512 coef_r1_2 = _mm512_load_4_m128(current_coeff + filter_size * 17, current_coeff + filter_size * 21, current_coeff + filter_size * 25, current_coeff + filter_size * 29);
-    __m512 coef_r2_2 = _mm512_load_4_m128(current_coeff + filter_size * 18, current_coeff + filter_size * 22, current_coeff + filter_size * 26, current_coeff + filter_size * 30);
-    __m512 coef_r3_2 = _mm512_load_4_m128(current_coeff + filter_size * 19, current_coeff + filter_size * 23, current_coeff + filter_size * 27, current_coeff + filter_size * 31);
-
-    _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
-    _MM_TRANSPOSE16_LANE4_PS(coef_r0_2, coef_r1_2, coef_r2_2, coef_r3_2);
-
-    // convert resampling program in H-form into permuting indexes for src transposition in V-form
-    int iStart = program->pixel_offset[x + 0];
-    __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \
-      program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
-    __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
-    __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
-    __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
-
-    int iStart_2 = program->pixel_offset[x + 16];
-    __m512i perm_0_2 = _mm512_set_epi32(program->pixel_offset[x + 31] - iStart_2, program->pixel_offset[x + 30] - iStart_2, program->pixel_offset[x + 29] - iStart_2, program->pixel_offset[x + 28] - iStart_2, program->pixel_offset[x + 27] - iStart_2, program->pixel_offset[x + 26] - iStart_2, program->pixel_offset[x + 25] - iStart_2, program->pixel_offset[x + 24] - iStart_2, \
-      program->pixel_offset[x + 23] - iStart_2, program->pixel_offset[x + 22] - iStart_2, program->pixel_offset[x + 21] - iStart_2, program->pixel_offset[x + 20] - iStart_2, program->pixel_offset[x + 19] - iStart_2, program->pixel_offset[x + 18] - iStart_2, program->pixel_offset[x + 17] - iStart_2, program->pixel_offset[x + 16] - iStart_2);
-    __m512i perm_1_2 = _mm512_add_epi32(perm_0, one_epi32);
-    __m512i perm_2_2 = _mm512_add_epi32(perm_1, one_epi32);
-    __m512i perm_3_2 = _mm512_add_epi32(perm_2, one_epi32);
-
-    float* AVS_RESTRICT dst_ptr = dst + x;
-    float* AVS_RESTRICT dst_ptr_2 = dst + x + 16;
-    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
-    const float* src_ptr_2 = src + program->pixel_offset[x + 16]; // all permute offsets relative to this start offset
-#if 0
-    for (int y = 0; y < height; y++) // single row proc, 32 output samples
-    {
-      __m512 data_src = _mm512_loadu_ps(src_ptr);
-      __m512 data_src_2 = _mm512_loadu_ps(src_ptr_2);
-
-      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
-      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
-      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
-      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
-
-      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0_2, data_src_2);
-      __m512 data_1_2 = _mm512_permutexvar_ps(perm_1_2, data_src_2);
-      __m512 data_2_2 = _mm512_permutexvar_ps(perm_2_2, data_src_2);
-      __m512 data_3_2 = _mm512_permutexvar_ps(perm_3_2, data_src_2);
-
-      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
-      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
-
-      __m512 result0_2 = _mm512_mul_ps(data_0_2, coef_r0_2);
-      __m512 result1_2 = _mm512_mul_ps(data_2_2, coef_r2_2);
-
-      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
-      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
-
-      result0_2 = _mm512_fmadd_ps(data_1_2, coef_r1_2, result0_2);
-      result1_2 = _mm512_fmadd_ps(data_3_2, coef_r3_2, result1_2);
-
-      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
-      _mm512_store_ps(dst_ptr_2, _mm512_add_ps(result0_2, result1_2));
-
-      dst_ptr += dst_pitch;
-      src_ptr += src_pitch;
-
-      dst_ptr_2 += dst_pitch;
-      src_ptr_2 += src_pitch;
-
-    }
-#endif // single row
-    //dual rows and 32 per vstripe
-    const int height_mod2 = (height / 2) * 2; // Process pairs of rows for better efficiency
-    // dual-rows not worst in performance - may be left for the future better memory performance and compute performance hosts
-    for (int y = 0; y < height_mod2; y += 2)
-    {
-      __m512 data_src = _mm512_loadu_ps(src_ptr);
-      __m512 data_src_2 = _mm512_loadu_ps(src_ptr_2);
-
-      __m512 data_src_2r = _mm512_loadu_ps(src_ptr + src_pitch);
-      __m512 data_src_2_2r = _mm512_loadu_ps(src_ptr_2 + src_pitch);
-
-      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
-      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
-      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
-      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
-
-      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0_2, data_src_2);
-      __m512 data_1_2 = _mm512_permutexvar_ps(perm_1_2, data_src_2);
-      __m512 data_2_2 = _mm512_permutexvar_ps(perm_2_2, data_src_2);
-      __m512 data_3_2 = _mm512_permutexvar_ps(perm_3_2, data_src_2);
-
-      __m512 data_0_2r = _mm512_permutexvar_ps(perm_0, data_src_2r);
-      __m512 data_1_2r = _mm512_permutexvar_ps(perm_1, data_src_2r);
-      __m512 data_2_2r = _mm512_permutexvar_ps(perm_2, data_src_2r);
-      __m512 data_3_2r = _mm512_permutexvar_ps(perm_3, data_src_2r);
-
-      __m512 data_0_2_2r = _mm512_permutexvar_ps(perm_0_2, data_src_2_2r);
-      __m512 data_1_2_2r = _mm512_permutexvar_ps(perm_1_2, data_src_2_2r);
-      __m512 data_2_2_2r = _mm512_permutexvar_ps(perm_2_2, data_src_2_2r);
-      __m512 data_3_2_2r = _mm512_permutexvar_ps(perm_3_2, data_src_2_2r);
-
-      // 1r
-      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
-      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
-
-      __m512 result0_2 = _mm512_mul_ps(data_0_2, coef_r0_2);
-      __m512 result1_2 = _mm512_mul_ps(data_2_2, coef_r2_2);
-
-      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
-      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
-
-      result0_2 = _mm512_fmadd_ps(data_1_2, coef_r1_2, result0_2);
-      result1_2 = _mm512_fmadd_ps(data_3_2, coef_r3_2, result1_2);
-
-      // 2r
-      __m512 result0_2r = _mm512_mul_ps(data_0_2r, coef_r0);
-      __m512 result1_2r = _mm512_mul_ps(data_2_2r, coef_r2);
-
-      __m512 result0_2_2r = _mm512_mul_ps(data_0_2_2r, coef_r0_2);
-      __m512 result1_2_2r = _mm512_mul_ps(data_2_2_2r, coef_r2_2);
-
-      result0_2r = _mm512_fmadd_ps(data_1_2r, coef_r1, result0_2r);
-      result1_2r = _mm512_fmadd_ps(data_3_2r, coef_r3, result1_2r);
-
-      result0_2_2r = _mm512_fmadd_ps(data_1_2_2r, coef_r1_2, result0_2_2r);
-      result1_2_2r = _mm512_fmadd_ps(data_3_2_2r, coef_r3_2, result1_2_2r);
-
-
-      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
-      _mm512_store_ps(dst_ptr_2, _mm512_add_ps(result0_2, result1_2));
-
-      _mm512_store_ps(dst_ptr + dst_pitch, _mm512_add_ps(result0_2r, result1_2r));
-      _mm512_store_ps(dst_ptr_2 + dst_pitch, _mm512_add_ps(result0_2_2r, result1_2_2r));
-
-
-      dst_ptr += dst_pitch * 2;
-      src_ptr += src_pitch * 2;
-
-      dst_ptr_2 += dst_pitch * 2;
-      src_ptr_2 += src_pitch * 2;
-
-    }
-*/
-
-
-// some slower than 32 per vstripe with small cacheabe frame sizes and best performance with large frame size and many threads (most SDRAM controller friendly ?)
-
-  for (int x = 0; x < width; x += 64) // processing by 64 output sample - it is safe at the end of row ?
-  {
-    // prepare coefs in transposed V-form
-    __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
-    __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
-    __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
-    __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
-
-    // prepare coefs in transposed V-form
-    __m512 coef_r0_2 = _mm512_load_4_m128(current_coeff + filter_size * 16, current_coeff + filter_size * 20, current_coeff + filter_size * 24, current_coeff + filter_size * 28);
-    __m512 coef_r1_2 = _mm512_load_4_m128(current_coeff + filter_size * 17, current_coeff + filter_size * 21, current_coeff + filter_size * 25, current_coeff + filter_size * 29);
-    __m512 coef_r2_2 = _mm512_load_4_m128(current_coeff + filter_size * 18, current_coeff + filter_size * 22, current_coeff + filter_size * 26, current_coeff + filter_size * 30);
-    __m512 coef_r3_2 = _mm512_load_4_m128(current_coeff + filter_size * 19, current_coeff + filter_size * 23, current_coeff + filter_size * 27, current_coeff + filter_size * 31);
-
-    // prepare coefs in transposed V-form
-    __m512 coef_r0_3 = _mm512_load_4_m128(current_coeff + filter_size * 32, current_coeff + filter_size * 36, current_coeff + filter_size * 40, current_coeff + filter_size * 44);
-    __m512 coef_r1_3 = _mm512_load_4_m128(current_coeff + filter_size * 33, current_coeff + filter_size * 37, current_coeff + filter_size * 41, current_coeff + filter_size * 45);
-    __m512 coef_r2_3 = _mm512_load_4_m128(current_coeff + filter_size * 34, current_coeff + filter_size * 38, current_coeff + filter_size * 42, current_coeff + filter_size * 46);
-    __m512 coef_r3_3 = _mm512_load_4_m128(current_coeff + filter_size * 35, current_coeff + filter_size * 39, current_coeff + filter_size * 43, current_coeff + filter_size * 47);
-
-    // prepare coefs in transposed V-form
-    __m512 coef_r0_4 = _mm512_load_4_m128(current_coeff + filter_size * 48, current_coeff + filter_size * 52, current_coeff + filter_size * 56, current_coeff + filter_size * 60);
-    __m512 coef_r1_4 = _mm512_load_4_m128(current_coeff + filter_size * 49, current_coeff + filter_size * 53, current_coeff + filter_size * 57, current_coeff + filter_size * 61);
-    __m512 coef_r2_4 = _mm512_load_4_m128(current_coeff + filter_size * 50, current_coeff + filter_size * 54, current_coeff + filter_size * 58, current_coeff + filter_size * 62);
-    __m512 coef_r3_4 = _mm512_load_4_m128(current_coeff + filter_size * 51, current_coeff + filter_size * 55, current_coeff + filter_size * 59, current_coeff + filter_size * 63);
-
-    _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
-    _MM_TRANSPOSE16_LANE4_PS(coef_r0_2, coef_r1_2, coef_r2_2, coef_r3_2);
-    _MM_TRANSPOSE16_LANE4_PS(coef_r0_3, coef_r1_3, coef_r2_3, coef_r3_3);
-    _MM_TRANSPOSE16_LANE4_PS(coef_r0_4, coef_r1_4, coef_r2_4, coef_r3_4);
-
-    // convert resampling program in H-form into permuting indexes for src transposition in V-form
-    int iStart = program->pixel_offset[x + 0];
-    __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \
-      program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
-
-    int iStart_2 = program->pixel_offset[x + 16];
-    __m512i perm_0_2 = _mm512_set_epi32(program->pixel_offset[x + 31] - iStart_2, program->pixel_offset[x + 30] - iStart_2, program->pixel_offset[x + 29] - iStart_2, program->pixel_offset[x + 28] - iStart_2, program->pixel_offset[x + 27] - iStart_2, program->pixel_offset[x + 26] - iStart_2, program->pixel_offset[x + 25] - iStart_2, program->pixel_offset[x + 24] - iStart_2, \
-      program->pixel_offset[x + 23] - iStart_2, program->pixel_offset[x + 22] - iStart_2, program->pixel_offset[x + 21] - iStart_2, program->pixel_offset[x + 20] - iStart_2, program->pixel_offset[x + 19] - iStart_2, program->pixel_offset[x + 18] - iStart_2, program->pixel_offset[x + 17] - iStart_2, program->pixel_offset[x + 16] - iStart_2);
-
-    int iStart_3 = program->pixel_offset[x + 32];
-    __m512i perm_0_3 = _mm512_set_epi32(program->pixel_offset[x + 47] - iStart_3, program->pixel_offset[x + 46] - iStart_3, program->pixel_offset[x + 45] - iStart_3, program->pixel_offset[x + 44] - iStart_3, program->pixel_offset[x + 43] - iStart_3, program->pixel_offset[x + 42] - iStart_3, program->pixel_offset[x + 41] - iStart_3, program->pixel_offset[x + 40] - iStart_3, \
-      program->pixel_offset[x + 39] - iStart_3, program->pixel_offset[x + 38] - iStart_3, program->pixel_offset[x + 37] - iStart_3, program->pixel_offset[x + 36] - iStart_3, program->pixel_offset[x + 35] - iStart_3, program->pixel_offset[x + 34] - iStart_3, program->pixel_offset[x + 33] - iStart_3, program->pixel_offset[x + 32] - iStart_3);
-
-    int iStart_4 = program->pixel_offset[x + 48];
-    __m512i perm_0_4 = _mm512_set_epi32(program->pixel_offset[x + 63] - iStart_4, program->pixel_offset[x + 62] - iStart_4, program->pixel_offset[x + 61] - iStart_4, program->pixel_offset[x + 60] - iStart_4, program->pixel_offset[x + 59] - iStart_4, program->pixel_offset[x + 58] - iStart_4, program->pixel_offset[x + 57] - iStart_4, program->pixel_offset[x + 56] - iStart_4, \
-      program->pixel_offset[x + 55] - iStart_4, program->pixel_offset[x + 54] - iStart_4, program->pixel_offset[x + 53] - iStart_4, program->pixel_offset[x + 52] - iStart_4, program->pixel_offset[x + 51] - iStart_4, program->pixel_offset[x + 50] - iStart_4, program->pixel_offset[x + 49] - iStart_4, program->pixel_offset[x + 48] - iStart_4);
-
-
-    float* AVS_RESTRICT dst_ptr = dst + x;
-    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
-    const float* src_ptr_2 = src + program->pixel_offset[x + 16]; // all permute offsets relative to this start offset
-    const float* src_ptr_3 = src + program->pixel_offset[x + 32]; // all permute offsets relative to this start offset
-    const float* src_ptr_4 = src + program->pixel_offset[x + 48]; // all permute offsets relative to this start offset
-
-    for (int y = 0; y < height; y++) // single row proc, 32 output samples
-    {
-      __m512 data_src = _mm512_loadu_ps(src_ptr);
-      __m512 data_src_2 = _mm512_loadu_ps(src_ptr_2);
-      __m512 data_src_3 = _mm512_loadu_ps(src_ptr_3);
-      __m512 data_src_4 = _mm512_loadu_ps(src_ptr_4);
-
-      // 1st
-      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
-
-      __m512i perm_next = _mm512_add_epi32(perm_0, one_epi32);
-      __m512 data_1 = _mm512_permutexvar_ps(perm_next, data_src);
-
-      perm_next = _mm512_add_epi32(perm_next, one_epi32);
-      __m512 data_2 = _mm512_permutexvar_ps(perm_next, data_src);
-
-      perm_next = _mm512_add_epi32(perm_next, one_epi32);
-      __m512 data_3 = _mm512_permutexvar_ps(perm_next, data_src);
-
-      // 2nd
-      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0_2, data_src_2);
-
-      __m512i perm_next_2 = _mm512_add_epi32(perm_0_2, one_epi32);
-      __m512 data_1_2 = _mm512_permutexvar_ps(perm_next_2, data_src_2);
-
-      perm_next_2 = _mm512_add_epi32(perm_0_2, one_epi32);
-      __m512 data_2_2 = _mm512_permutexvar_ps(perm_next_2, data_src_2);
-
-      perm_next_2 = _mm512_add_epi32(perm_next_2, one_epi32);
-      __m512 data_3_2 = _mm512_permutexvar_ps(perm_next_2, data_src_2);
-
-      // 3rd
-      __m512 data_0_3 = _mm512_permutexvar_ps(perm_0_3, data_src_3);
-
-      __m512i perm_next_3 = _mm512_add_epi32(perm_0_3, one_epi32);
-      __m512 data_1_3 = _mm512_permutexvar_ps(perm_next_3, data_src_3);
-
-      perm_next_3 = _mm512_add_epi32(perm_0_3, one_epi32);
-      __m512 data_2_3 = _mm512_permutexvar_ps(perm_next_3, data_src_3);
-
-      perm_next_3 = _mm512_add_epi32(perm_next_3, one_epi32);
-      __m512 data_3_3 = _mm512_permutexvar_ps(perm_next_3, data_src_3);
-
-      // 4th
-      __m512 data_0_4 = _mm512_permutexvar_ps(perm_0_4, data_src_4);
-
-      __m512i perm_next_4 = _mm512_add_epi32(perm_0_4, one_epi32);
-      __m512 data_1_4 = _mm512_permutexvar_ps(perm_next_4, data_src_4);
-
-      perm_next_4 = _mm512_add_epi32(perm_0_4, one_epi32);
-      __m512 data_2_4 = _mm512_permutexvar_ps(perm_next_4, data_src_4);
-
-      perm_next_4 = _mm512_add_epi32(perm_next_4, one_epi32);
-      __m512 data_3_4 = _mm512_permutexvar_ps(perm_next_4, data_src_3);
-
-
-      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
-      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
-
-      __m512 result0_2 = _mm512_mul_ps(data_0_2, coef_r0_2);
-      __m512 result1_2 = _mm512_mul_ps(data_2_2, coef_r2_2);
-
-      __m512 result0_3 = _mm512_mul_ps(data_0_3, coef_r0_3);
-      __m512 result1_3 = _mm512_mul_ps(data_2_3, coef_r2_3);
-
-      __m512 result0_4 = _mm512_mul_ps(data_0_4, coef_r0_4);
-      __m512 result1_4 = _mm512_mul_ps(data_2_4, coef_r2_4);
-
-      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
-      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
-
-      result0_2 = _mm512_fmadd_ps(data_1_2, coef_r1_2, result0_2);
-      result1_2 = _mm512_fmadd_ps(data_3_2, coef_r3_2, result1_2);
-
-      result0_3 = _mm512_fmadd_ps(data_1_3, coef_r1_3, result0_3);
-      result1_3 = _mm512_fmadd_ps(data_3_3, coef_r3_3, result1_3);
-
-      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
-      _mm512_store_ps(dst_ptr + 16, _mm512_add_ps(result0_2, result1_2));
-      _mm512_store_ps(dst_ptr + 32, _mm512_add_ps(result0_3, result1_3));
-      _mm512_store_ps(dst_ptr + 48, _mm512_add_ps(result0_4, result1_4));
-
-      dst_ptr += dst_pitch;
-
-      src_ptr += src_pitch;
-      src_ptr_2 += src_pitch;
-      src_ptr_3 += src_pitch;
-      src_ptr_4 += src_pitch;
-
-    }
-
-
-/*
-    const int height_mod2 = (height / 2) * 2; // Process pairs of rows for better efficiency
-    // dual-rows not worst in performance - may be left for the future better memory performance and compute performance hosts
-    for (int y = 0; y < height_mod2; y += 2)
-    {
-      __m512 data_src = _mm512_loadu_ps(src_ptr);
-      __m512 data_src_2 = _mm512_loadu_ps(src_ptr + src_pitch);
-
-      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
-      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
-      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
-      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
-
-      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0, data_src_2);
-      __m512 data_1_2 = _mm512_permutexvar_ps(perm_1, data_src_2);
-      __m512 data_2_2 = _mm512_permutexvar_ps(perm_2, data_src_2);
-      __m512 data_3_2 = _mm512_permutexvar_ps(perm_3, data_src_2);
-
-      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
-      __m512 result1 = _mm512_mul_ps(data_0_2, coef_r0);
-
-      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
-      result1 = _mm512_fmadd_ps(data_1_2, coef_r1, result1);
-
-      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
-      result1 = _mm512_fmadd_ps(data_2_2, coef_r2, result1);
-
-      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
-      result1 = _mm512_fmadd_ps(data_3_2, coef_r3, result1);
-
-      _mm512_store_ps(dst_ptr, result0);
-      _mm512_store_ps(dst_ptr + dst_pitch, result1);
-
-      dst_ptr += dst_pitch * 2;
-      src_ptr += src_pitch * 2;
-    }
-
-    if (height > height_mod2) // last row
-    {
-      __m512 data_src = _mm512_loadu_ps(src_ptr);
-
-      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
-      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
-      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
-      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
-
-      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
-      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
-
-      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
-      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
-
-      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
-    }
-*/ // dual rows
-
-    current_coeff += filter_size * 64;
-  }
-
-
 }
-
+#endif
 void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
 {
   // assert - check if max pixel_offset is not above single load of 16 src floats (or need several loads and more complex permute program)
@@ -628,7 +427,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* s
   int filter_size = program->filter_size;
 
   const float* AVS_RESTRICT current_coeff;
-  __m512i one_epi32 = _mm512_set1_epi32(1); 
+  __m512i one_epi32 = _mm512_set1_epi32(1);
 
   src_pitch = src_pitch / sizeof(float);
   dst_pitch = dst_pitch / sizeof(float);
@@ -804,6 +603,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* s
   }
 }
 
+#if 0
 void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
 {
   // assert - check if max pixel_offset is not above single load of 16 src floats (or need several loads and more complex permute program)
@@ -819,7 +619,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE*
   int filter_size = program->filter_size;
 
   const float* AVS_RESTRICT current_coeff;
-  __m512i one_epi32 = _mm512_set1_epi32(1); 
+  __m512i one_epi32 = _mm512_set1_epi32(1);
 
   src_pitch = src_pitch / sizeof(float);
   dst_pitch = dst_pitch / sizeof(float);
@@ -961,6 +761,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE*
     current_coeff += filter_size * 16;
   }
 }
+#endif
 
 
 
diff --git a/avs_core/filters/intel/resample_avx512.h b/avs_core/filters/intel/resample_avx512.h
index dc98e0052..77117f2b3 100644
--- a/avs_core/filters/intel/resample_avx512.h
+++ b/avs_core/filters/intel/resample_avx512.h
@@ -38,26 +38,70 @@
 #include <avisynth.h>
 #include "../resample_functions.h"
 
-void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
-void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
-void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
-void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+#include <immintrin.h> // includes AVX, AVX2, FMA3, AVX512F, AVX512BW, etc. for MSVC, Clang, and GCC
 
-void resize_v_avx512_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+// compiler feature checks and error handling
+#if defined(__clang__) && !defined(_MSC_VER)
+#if !defined(__AVX512F__) || !defined(__AVX512BW__)
+#error "This code requires a compiler that supports AVX-512F and AVX-512BW.  Use compiler flags -mavx512f -mavx512bw."
+#endif
+#elif defined(__GNUC__)
+#if !defined(__AVX512F__) || !defined(__AVX512BW__)
+#error "This code requires a compiler that supports AVX-512F and AVX-512BW.  Use compiler flags -mavx512f -mavx512bw."
+#endif
+#elif defined(_MSC_VER)
+  #if !defined(_M_X64) && !defined(_M_AMD64) && !defined(_M_ARM64)
+  #error "AVX-512 is only supported on x64 and ARM64 architectures."
+  #endif
+  // MSVC's <immintrin.h> provides AVX-512 support when /arch:AVX512 is used.
+  // However, MSVC may not define __AVX512F__ or __AVX512BW__ consistently.
+  // We rely on /arch:AVX512 having been set, and assume that if the user is
+  // including this header, they intend to use AVX-512.
+#else
+  #error "Unsupported compiler. This code requires a compiler that supports AVX-512F and AVX-512BW (GCC, Clang, or MSVC)."
+#endif
 
+#if !defined(__FMA__)
+// Assume that all processors that have AVX2/AVX512 also have FMA3
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (__clang__)
+// Prevent error message in g++ when using FMA intrinsics with avx2:
+#pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
+#else
+#define __FMA__  1
+#endif
+#endif
+// FMA3 instruction set
+#if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__))  && ! defined (__INTEL_COMPILER)
+#include <fmaintrin.h>
+#endif // __FMA__
+
+// MSVC Missing Intrinsics (Workaround for older MSVC versions)
+#if defined(_MSC_VER) && !defined(__clang__)
+#if _MSC_VER < 1922 // Check for MSVC version less than 16.2 (VS 2019 16.2)
+  // Define missing AVX-512BW mask intrinsics for older MSVC.
+  // inline functions that perform the mask operations directly.
+  // Since this is MSVC only, using specific __forceinline.
+__forceinline __mmask64 _kand_mask64(__mmask64 a, __mmask64 b) { return a & b; }
+__forceinline __mmask64 _kor_mask64(__mmask64 a, __mmask64 b) { return a | b; }
+__forceinline __mmask32 _kand_mask32(__mmask32 a, __mmask32 b) { return a & b; }
+__forceinline __mmask32 _kor_mask32(__mmask32 a, __mmask32 b) { return a | b; }
+#endif
+#endif
+
+// useful macros
 
 #define _MM_TRANSPOSE16_LANE4_PS(row0, row1, row2, row3) \
-	do { \
-		__m512 __t0, __t1, __t2, __t3; \
-		__t0 = _mm512_unpacklo_ps(row0, row1); \
-		__t1 = _mm512_unpackhi_ps(row0, row1); \
-		__t2 = _mm512_unpacklo_ps(row2, row3); \
-		__t3 = _mm512_unpackhi_ps(row2, row3); \
-		row0 = _mm512_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
-		row1 = _mm512_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
-		row2 = _mm512_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
-		row3 = _mm512_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
-	} while (0)
+  do { \
+    __m512 __t0, __t1, __t2, __t3; \
+    __t0 = _mm512_unpacklo_ps(row0, row1); \
+    __t1 = _mm512_unpackhi_ps(row0, row1); \
+    __t2 = _mm512_unpacklo_ps(row2, row3); \
+    __t3 = _mm512_unpackhi_ps(row2, row3); \
+    row0 = _mm512_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
+    row1 = _mm512_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
+    row2 = _mm512_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
+    row3 = _mm512_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
+  } while (0)
 
 #ifndef _mm512_loadu_4_m128
 #define _mm512_loadu_4_m128(/* __m128 const* */ addr1, \
@@ -75,4 +119,14 @@ _mm512_insertf32x4(_mm512_insertf32x4(_mm512_insertf32x4(_mm512_castps128_ps512(
 _mm512_insertf32x4(_mm512_insertf32x4(_mm512_insertf32x4(_mm512_castps128_ps512(_mm_load_ps(addr1)), _mm_load_ps(addr2), 1), _mm_load_ps(addr3), 2), _mm_load_ps(addr4), 3)
 #endif
 
+
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+void resize_v_avx512_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
 #endif // __Resample_AVX512_H__
diff --git a/avs_core/filters/intel/resample_sse.cpp b/avs_core/filters/intel/resample_sse.cpp
index a3729c295..7f0afbca3 100644
--- a/avs_core/filters/intel/resample_sse.cpp
+++ b/avs_core/filters/intel/resample_sse.cpp
@@ -152,7 +152,7 @@ void resize_v_mmx_planar(BYTE* dst, const BYTE* src, int dst_pitch, int src_pitc
 }
 #endif
 
-/*
+#if 0
 void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
 {
   AVS_UNUSED(bits_per_pixel);
@@ -226,7 +226,7 @@ void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pi
     current_coeff += filter_size;
   }
 }
-*/
+#else
 
 void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
 {
@@ -340,7 +340,7 @@ void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pi
         current_coeff += filter_size;
     }
 }
-
+#endif
 // like the AVX2 version, but only 8 pixels at a time
 template<bool lessthan16bit>
 void resize_v_sse2_planar_uint16_t(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
@@ -597,7 +597,7 @@ void resizer_h_ssse3_generic_float(BYTE* dst8, const BYTE* src8, int dst_pitch,
   dst_pitch = dst_pitch / sizeof(float);
   src_pitch = src_pitch / sizeof(float);
 
-  const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8;
+  const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8;
 
   for (int y = 0; y < height; y++) {
     float* current_coeff_base = program->pixel_coefficient_float;
@@ -987,7 +987,7 @@ void resizer_h_ssse3_generic_uint8_16(BYTE* dst8, const BYTE* src8, int dst_pitc
   dst_pitch /= sizeof(pixel_t);
   src_pitch /= sizeof(pixel_t);
 
-  const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8;
+  const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8;
 
   for (int y = 0; y < height; y++) {
     const short* AVS_RESTRICT current_coeff_base = program->pixel_coefficient;
@@ -1020,161 +1020,287 @@ template void resize_v_sse2_planar_uint16_t<true>(BYTE* dst8, const BYTE* src8,
 
 // Transpose-based SIMD
 void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
-	int filter_size = program->filter_size;
+  int filter_size = program->filter_size;
+
+  const float* AVS_RESTRICT current_coeff;
 
-	const float* AVS_RESTRICT current_coeff;
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
 
-	src_pitch = src_pitch / sizeof(float);
-	dst_pitch = dst_pitch / sizeof(float);
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
 
-	float* src = (float*)src8;
-	float* dst = (float*)dst8;
+  const int kernel_size = program->filter_size_real;
+  const int ksmod4 = kernel_size / 4 * 4;
+  //	const int ksmod8 = kernel_size / 8 * 8;
 
-	const int kernel_size = program->filter_size_real;
-	const int ksmod4 = kernel_size / 4 * 4;
-	//	const int ksmod8 = kernel_size / 8 * 8;
 #if 0
     // single row processing - slower
-	for (int y = 0; y < height; y++) {
-		current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+  for (int y = 0; y < height; y++) {
+    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
 
-		float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
-		const float* src_ptr = src + y * src_pitch;
+    float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
+    const float* src_ptr = src + y * src_pitch;
 
-		for (int x = 0; x < width; x+=4) {
+    // FIXME: the SIMD safe end is not width, but safe_width
+    for (int x = 0; x < width; x += 4) {
 
-			__m128 result = _mm_setzero_ps();
+      __m128 result = _mm_setzero_ps();
 
-			for (int i = 0; i < ksmod4; i += 4) {
-				__m128 data_1 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 0] + i);
-				__m128 data_2 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 1] + i);
-				__m128 data_3 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 2] + i);
-				__m128 data_4 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 3] + i);
+      for (int i = 0; i < ksmod4; i += 4) {
+        // 4 pixels, in outer x loop. Each has different "begin" offset
+        __m128 data_1 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 0] + i);
+        __m128 data_2 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 1] + i);
+        __m128 data_3 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 2] + i);
+        __m128 data_4 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 3] + i);
 
-				__m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0); 
-				__m128 coeff_2 = _mm_load_ps(current_coeff + i + filter_size * 1);
-				__m128 coeff_3 = _mm_load_ps(current_coeff + i + filter_size * 2);
-				__m128 coeff_4 = _mm_load_ps(current_coeff + i + filter_size * 3);
+        __m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0);
+        __m128 coeff_2 = _mm_load_ps(current_coeff + i + filter_size * 1);
+        __m128 coeff_3 = _mm_load_ps(current_coeff + i + filter_size * 2);
+        __m128 coeff_4 = _mm_load_ps(current_coeff + i + filter_size * 3);
 
-				_MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
-				_MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
+        _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
+        _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
 
-                result = _mm_add_ps(_mm_mul_ps(data_1, coeff_1), result);
-                result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
-                result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
-                result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
-            }
+        result = _mm_add_ps(_mm_mul_ps(data_1, coeff_1), result);
+        result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
+        result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
+        result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
+      }
 
-			_mm_store_ps(dst2_ptr + x, result);
-			current_coeff += filter_size * 4;
-		}
-	}
+      _mm_store_ps(dst2_ptr + x, result);
+      current_coeff += filter_size * 4;
+    }
+  }
 #endif
+  constexpr int PIXELS_AT_A_TIME = 4;
+  // source_overread_beyond_targetx must be compatible with the number of source pixels loaded by SIMD load.
+  // loadu_ps: 4 pixels.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
 
-    for (int y = 0; y < height; y+=2) {
-        current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+  // FIXME: unsafe for odd heights: the loop bound must be height rounded down to a multiple of 2, else src_ptr2/dst2_ptr2 address a row beyond the frame
+  for (int y = 0; y < height; y += 2) {
+    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
 
-        float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
-        float* AVS_RESTRICT dst2_ptr2 = dst + (y + 1) * dst_pitch;
-        const float* src_ptr = src + y * src_pitch;
-        const float* src_ptr2 = src + (y + 1) * src_pitch;
+    float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
+    float* AVS_RESTRICT dst2_ptr2 = dst + (y + 1) * dst_pitch;
+    const float* src_ptr = src + y * src_pitch;
+    const float* src_ptr2 = src + (y + 1) * src_pitch;
 
-        for (int x = 0; x < width; x += 4) {
+    // 1st pass: from 0 to width_safe_mod in PIXELS_AT_A_TIME steps
+    // 2nd pass: from width_safe_mod to width in single pixel steps
+    //for (int x = 0; x < width_safe_mod; x += PIXELS_AT_A_TIME) {
+    for (int x = 0; x < width; x += PIXELS_AT_A_TIME) {
 
-            __m128 result = _mm_setzero_ps();
-            __m128 result2 = _mm_setzero_ps();
+      __m128 result = _mm_setzero_ps();
+      __m128 result2 = _mm_setzero_ps();
 
-            for (int i = 0; i < kernel_size; i += 4) { // it is always mod4 ?
+      for (int i = 0; i < kernel_size; i += 4) { // it is always mod4 ?
 
-                __m128 data_1 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 0] + i);
-                __m128 data_2 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 1] + i);
-                __m128 data_3 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 2] + i);
-                __m128 data_4 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 3] + i);
+        const int begin1 = program->pixel_offset[x + 0];
+        const int begin2 = program->pixel_offset[x + 1];
+        const int begin3 = program->pixel_offset[x + 2];
+        const int begin4 = program->pixel_offset[x + 3];
 
-                __m128 data_1_2 = _mm_loadu_ps(src_ptr2 + program->pixel_offset[x + 0] + i);
-                __m128 data_2_2 = _mm_loadu_ps(src_ptr2 + program->pixel_offset[x + 1] + i);
-                __m128 data_3_2 = _mm_loadu_ps(src_ptr2 + program->pixel_offset[x + 2] + i);
-                __m128 data_4_2 = _mm_loadu_ps(src_ptr2 + program->pixel_offset[x + 3] + i);
+        // this is not good, src_ptr must be used instead of src_ptr + i
+        __m128 data_1 = _mm_loadu_ps(src_ptr + i + begin1);
+        __m128 data_2 = _mm_loadu_ps(src_ptr + i + begin2);
+        __m128 data_3 = _mm_loadu_ps(src_ptr + i + begin3);
+        __m128 data_4 = _mm_loadu_ps(src_ptr + i + begin4);
 
-                __m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0);
-                __m128 coeff_2 = _mm_load_ps(current_coeff + i + filter_size * 1);
-                __m128 coeff_3 = _mm_load_ps(current_coeff + i + filter_size * 2);
-                __m128 coeff_4 = _mm_load_ps(current_coeff + i + filter_size * 3);
+        __m128 data_1_2 = _mm_loadu_ps(src_ptr2 + i + begin1);
+        __m128 data_2_2 = _mm_loadu_ps(src_ptr2 + i + begin2);
+        __m128 data_3_2 = _mm_loadu_ps(src_ptr2 + i + begin3);
+        __m128 data_4_2 = _mm_loadu_ps(src_ptr2 + i + begin4);
 
-                _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
-                _MM_TRANSPOSE4_PS(data_1_2, data_2_2, data_3_2, data_4_2);
-                _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
+        __m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0);
+        __m128 coeff_2 = _mm_load_ps(current_coeff + i + filter_size * 1);
+        __m128 coeff_3 = _mm_load_ps(current_coeff + i + filter_size * 2);
+        __m128 coeff_4 = _mm_load_ps(current_coeff + i + filter_size * 3);
 
-                result = _mm_add_ps(_mm_mul_ps(data_1, coeff_1), result);
-                result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
-                result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
-                result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
+        _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
+        _MM_TRANSPOSE4_PS(data_1_2, data_2_2, data_3_2, data_4_2);
+        _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
 
-                result2 = _mm_add_ps(_mm_mul_ps(data_1_2, coeff_1), result2);
-                result2 = _mm_add_ps(_mm_mul_ps(data_2_2, coeff_2), result2);
-                result2 = _mm_add_ps(_mm_mul_ps(data_3_2, coeff_3), result2);
-                result2 = _mm_add_ps(_mm_mul_ps(data_4_2, coeff_4), result2);
+        result = _mm_add_ps(_mm_mul_ps(data_1, coeff_1), result);
+        result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
+        result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
+        result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
 
-            }
+        result2 = _mm_add_ps(_mm_mul_ps(data_1_2, coeff_1), result2);
+        result2 = _mm_add_ps(_mm_mul_ps(data_2_2, coeff_2), result2);
+        result2 = _mm_add_ps(_mm_mul_ps(data_3_2, coeff_3), result2);
+        result2 = _mm_add_ps(_mm_mul_ps(data_4_2, coeff_4), result2);
 
-            _mm_store_ps(dst2_ptr + x, result);
-            _mm_store_ps(dst2_ptr2 + x, result2);
+      }
 
-            current_coeff += filter_size * 4;
-        }
+      _mm_store_ps(dst2_ptr + x, result);
+      _mm_store_ps(dst2_ptr2 + x, result2);
+
+      current_coeff += filter_size * 4;
     }
+  }
 
-    // to do - need to process last row of not-mod2 heights
+  // to do - need to process last row of not-mod2 heights
 }
 
-// process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2
-void resize_h_planar_float_sse_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) 
-{
-    int filter_size = program->filter_size;
+// Safe partial load with SSE2
+// Read exactly Nmod4 pixels (a full 4 when Nmod4 == 0), which avoids
+// - reading beyond the end of the source buffer.
+// - NaN contamination, since even with zero coefficients NaN * 0 = NaN
+template <int Nmod4>
+AVS_FORCEINLINE static __m128 load_partial_safe_sse2(const float* src_ptr_offsetted) {
+  switch (Nmod4) {
+  case 1:
+    return _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted[0]);
+    // ideally: movss
+  case 2:
+    return _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted[1], src_ptr_offsetted[0]);
+    // ideally: movsd
+  case 3:
+    return _mm_set_ps(0.0f, src_ptr_offsetted[2], src_ptr_offsetted[1], src_ptr_offsetted[0]);
+    // ideally: movss + movsd + shuffle or movsd + insert
+  case 0:
+    return _mm_set_ps(src_ptr_offsetted[3], src_ptr_offsetted[2], src_ptr_offsetted[1], src_ptr_offsetted[0]);
+    // ideally: movups
+  default:
+    return _mm_setzero_ps(); // n/a cannot happen
+  }
+}
 
-    const float* AVS_RESTRICT current_coeff;
+// Processes a horizontal resampling kernel of up to four coefficients for float pixel types.
+// Supports BilinearResize, BicubicResize, or sinc with up to 2 taps (filter size <= 4).
+// SSE2 optimization loads and processes four float coefficients and pixels simultaneously.
+// The 'filtersizemod4' template parameter (0-3) helps optimize for different filter sizes modulo 4.
+// This SSE2 version requires only a filter_size_alignment of 4.
+template<int filtersizemod4>
+void resize_h_planar_float_sse_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+  assert(filtersizemod4 >= 0 && filtersizemod4 <= 3);
 
-    src_pitch = src_pitch / sizeof(float);
-    dst_pitch = dst_pitch / sizeof(float);
+  const int filter_size = program->filter_size; // aligned, practically the coeff table stride
 
-    float* src = (float*)src8;
-    float* dst = (float*)dst8;
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(float);
 
-    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
 
-    for (int x = 0; x < width; x += 4)
-    {
-        __m128 coeff_1 = _mm_load_ps(current_coeff + filter_size * 0);
-        __m128 coeff_2 = _mm_load_ps(current_coeff + filter_size * 1);
-        __m128 coeff_3 = _mm_load_ps(current_coeff + filter_size * 2);
-        __m128 coeff_4 = _mm_load_ps(current_coeff + filter_size * 3);
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
 
-        _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
+  constexpr int PIXELS_AT_A_TIME = 4; // Process four pixels in parallel using SSE2
 
-        float* AVS_RESTRICT dst_ptr = dst + x;
-        const float* src_ptr = src;
+  // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width.
+  // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back
+  // from the target width, as we load 4 floats at once with '_mm_loadu_ps'.
+  // In AVX2 we process two lanes, so any of the 8 offsets cannot be safely used, fallback to the unsafe case.
+  // This is why then safelimit_4_pixels is used combined with safelimit_4 / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
 
-        for (int y = 0; y < height; y++)
-        {
-            __m128 data_1 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 0]);
-            __m128 data_2 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 1]);
-            __m128 data_3 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 2]);
-            __m128 data_4 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 3]);
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop.
 
-            _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
+  // 'target_size_alignment' ensures we can safely access coefficients using offsets like
+  // 'filter_size * 3' when processing 4 H pixels at a time or
+  // 'filter_size * 7' when processing 8 H pixels at a time or
+  // 'filter_size * 15' when processing 16 H pixels at a time
+  assert(program->target_size_alignment >= 4);
 
-            __m128 result = _mm_mul_ps(data_1, coeff_1);
-            result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
-            result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
-            result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
+  // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads.
+  assert(program->filter_size_alignment >= 4);
 
-            _mm_store_ps(dst_ptr, result);
+  int x = 0;
 
-            dst_ptr += dst_pitch;
-            src_ptr += src_pitch;
-        }
-        current_coeff += filter_size * 4;
-    }
+  // This 'auto' lambda construct replaces the need of templates
+  auto do_h_float_core = [&](auto partial_load) {
+    // Load up to 4 coefficients at once before the height loop.
+    // Pre-loading and transposing coefficients keeps register usage efficient.
+    // Assumes 'filter_size_aligned' is at least 4.
+    __m128 coeff_1 = _mm_load_ps(current_coeff + filter_size * 0); // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3])
+    __m128 coeff_2 = _mm_load_ps(current_coeff + filter_size * 1); // for src_ptr + begin2 [0..3]
+    __m128 coeff_3 = _mm_load_ps(current_coeff + filter_size * 2); // for src_ptr + begin3 [0..3]
+    __m128 coeff_4 = _mm_load_ps(current_coeff + filter_size * 3); // for src_ptr + begin4 [0..3]
+
+    _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src;
 
+    // Pixel offsets for the current target x-positions.
+    // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+    const int begin1 = program->pixel_offset[x + 0];
+    const int begin2 = program->pixel_offset[x + 1];
+    const int begin3 = program->pixel_offset[x + 2];
+    const int begin4 = program->pixel_offset[x + 3];
+
+    for (int y = 0; y < height; y++)
+    {
+      __m128 data_1;
+      __m128 data_2;
+      __m128 data_3;
+      __m128 data_4;
+      if constexpr (partial_load) {
+        // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+        // to prevent reading beyond the allocated source scanline. This handles cases where loading 4 floats
+        // starting from 'src_ptr + beginX' might exceed the source buffer.
+
+        // Example of the unsafe scenario: If target width is 320, a naive load at src_ptr + 317
+        // would attempt to read floats at indices 317, 318, 319, and 320, potentially going out of bounds.
+
+        // Two main issues in the unsafe zone:
+        // 1.) Out-of-bounds memory access: Reading beyond the allocated memory for the source scanline can
+        //     lead to access violations and crashes. '_mm_loadu_ps' attempts to load 16 bytes, so even if
+        //     the starting address is within bounds, subsequent reads might not be.
+        // 2.) Garbage or NaN values: Even if a read doesn't cause a crash, accessing uninitialized or
+        //     out-of-bounds memory (especially for float types) can result in garbage data, including NaN.
+        //     Multiplying by a valid coefficient and accumulating this NaN can contaminate the final result.
+
+        // 'load_partial_safe_sse2' safely loads up to 'filter_size_real' pixels and pads with zeros if needed,
+        // preventing out-of-bounds reads and ensuring predictable results even near the image edges.
+
+        data_1 = load_partial_safe_sse2<filtersizemod4>(src_ptr + begin1);
+        data_2 = load_partial_safe_sse2<filtersizemod4>(src_ptr + begin2);
+        data_3 = load_partial_safe_sse2<filtersizemod4>(src_ptr + begin3);
+        data_4 = load_partial_safe_sse2<filtersizemod4>(src_ptr + begin4);
+      }
+      else {
+        // In the safe zone, we can directly load 4 pixels at a time using unaligned loads.
+        data_1 = _mm_loadu_ps(src_ptr + begin1);
+        data_2 = _mm_loadu_ps(src_ptr + begin2);
+        data_3 = _mm_loadu_ps(src_ptr + begin3);
+        data_4 = _mm_loadu_ps(src_ptr + begin4);
+      }
+
+      _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
+
+      __m128 result = _mm_mul_ps(data_1, coeff_1);
+      result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
+      result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
+      result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
+
+      _mm_store_ps(dst_ptr, result);
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    } // y
+    current_coeff += filter_size * 4; // Move to the next set of coefficients for the next 4 output pixels
+    }; // end of lambda
+
+  // Process the 'safe zone' where direct full unaligned loads are acceptable.
+  for (; x < width_safe_mod; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm_loadu_ps
+  }
+
+  // Process the potentially 'unsafe zone' near the image edge, using safe loading.
+  for (; x < width; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::true_type{}); // partial_load == true, use the safer 'load_partial_safe_sse2'
+  }
 }
 
+// Instantiate them
+template void resize_h_planar_float_sse_transpose_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_sse_transpose_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_sse_transpose_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_sse_transpose_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
diff --git a/avs_core/filters/intel/resample_sse.h b/avs_core/filters/intel/resample_sse.h
index e4febc523..c09228a07 100644
--- a/avs_core/filters/intel/resample_sse.h
+++ b/avs_core/filters/intel/resample_sse.h
@@ -61,6 +61,8 @@ __attribute__((__target__("ssse3")))
 void resizer_h_ssse3_generic_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
 void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+template<int filtersizemod4>
 void resize_h_planar_float_sse_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
 #endif // __Resample_SSE_H__

From 6ef577411747d5e04f7627115192d5594e4bd570 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Thu, 22 May 2025 19:44:54 +0300
Subject: [PATCH 14/27] Added AVX512

universal processing ks4 H-resize (called from resample.cpp)
---
 avs_core/filters/resample.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index 9bb6cd149..52687d034 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -1611,10 +1611,14 @@ ResamplerH FilteredResizeH::GetResampler(int CPU, int pixelsize, int bits_per_pi
     if ((CPU & CPUF_AVX512F) && program->filter_size_real <= 4) {
       //return resize_h_planar_float_avx2_permutex_vstripe_ks4;
       switch (program->filter_size_real) {
-      case 1: return resize_h_planar_float_avx512_transpose_vstripe_ks4<1>; break;
+/*      case 1: return resize_h_planar_float_avx512_transpose_vstripe_ks4<1>; break;
       case 2: return resize_h_planar_float_avx512_transpose_vstripe_ks4<2>; break;
       case 3: return resize_h_planar_float_avx512_transpose_vstripe_ks4<3>; break;
-      case 4: return resize_h_planar_float_avx512_transpose_vstripe_ks4<0>; break;
+      case 4: return resize_h_planar_float_avx512_transpose_vstripe_ks4<0>; break;*/
+      case 1: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<0>; break;
       }
     }
 #endif

From 3303ba400a71a891af251b450538d39dac04870d Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Thu, 22 May 2025 19:48:25 +0300
Subject: [PATCH 15/27] Added AVX512

resize_h_planar_float_avx512_gather_permutex_vstripe_ks4() universal function with auto-selection and loading of up to 32 sequential source floats for 16 output float samples. Not yet well debugged. Also, the work unit size for the permutex transpose looks too small for AVX512 and needs adjusting to 2x or 4x size (in the H direction, or in both the H and V directions - needs more performance tuning).
---
 avs_core/filters/intel/resample_avx512.cpp | 226 +++++++++++++++++++++
 avs_core/filters/intel/resample_avx512.h   |   4 +
 2 files changed, 230 insertions(+)

diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp
index a711bc97a..15957a488 100644
--- a/avs_core/filters/intel/resample_avx512.cpp
+++ b/avs_core/filters/intel/resample_avx512.cpp
@@ -227,6 +227,232 @@ template void resize_h_planar_float_avx512_transpose_vstripe_ks4<1>(BYTE* dst8,
 template void resize_h_planar_float_avx512_transpose_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 template void resize_h_planar_float_avx512_transpose_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
+/* Universal ks4 (filter_size_real <= 4) float H-resizer. Writes 'height' rows of 'width' outputs, 16 outputs per step; pitches are given in bytes. The method is chosen once per call from the resampling program:
+1. Permutex path (typically upsampling): every group of 16 outputs needs at most 32 sequential source floats, which are loaded once per row and permuted into V-form before the FMAs.
+2. Gather path (downsampling or any wider offset spread): 4 source floats are loaded per output pixel by direct addressing and transposed, with safe partial loads near the right edge.
+*/
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  static_assert(filtersizemod4 >= 0 && filtersizemod4 <= 3, "filtersizemod4 must be 0..3");
+
+  const int filter_size = program->filter_size; // aligned, practically the coeff table stride
+
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(float);
+
+  const float* src = reinterpret_cast<const float*>(src8); // keep the source read-only
+  float* dst = reinterpret_cast<float*>(dst8);
+
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  constexpr int PIXELS_AT_A_TIME = 16; // Process sixteen pixels in parallel using AVX512 (4x4 using m128 lanes)
+
+  // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width.
+  // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back
+  // from the target width, as we load 4 floats at once conceptually with our safe load.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
+
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop.
+
+  // 'target_size_alignment' ensures we can safely access coefficients using offsets like
+  // 'filter_size * 7' when processing 8 H pixels at a time or
+  // 'filter_size * 15' when processing 16 H pixels at a time
+  assert(program->target_size_alignment >= 16); // Adjusted for 16 pixels
+  assert(FRAME_ALIGN >= 64); // Adjusted for 16 pixels AviSynth+ default
+
+  // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads.
+  assert(program->filter_size_alignment >= 4);
+
+  bool bDoGather = false;
+  // Analyse the resampling program to select the processing method.
+  // The permutex path loads exactly 32 sequential source floats per group of 16 output pixels, starting at the
+  // offset of the group's first pixel, so its permute indexes are only valid in the range 0..31. A group fits when
+  //   (offset of its last pixel - offset of its first pixel) + (filter_size_real - 1) <= 31.
+  // Taps at or above filter_size_real are not counted (their coefficients are expected to be zero padding).
+  // Groups are the same 16-aligned units that are processed below, including the last (possibly partial) one;
+  // only real target pixels (x < width) are examined.
+  // NOTE(review): like the original check, this assumes pixel_offset[] is non-decreasing - confirm.
+  const int max_tap = program->filter_size_real - 1;
+  for (int gx = 0; gx < width; gx += PIXELS_AT_A_TIME)
+  {
+    const int last_x = (gx + PIXELS_AT_A_TIME - 1 < width) ? gx + PIXELS_AT_A_TIME - 1 : width - 1;
+    const int start_off = program->pixel_offset[gx];
+    const int end_off = program->pixel_offset[last_x];
+    if ((end_off - start_off) + max_tap > 31) {
+      bDoGather = true; // at least one group does not fit into the 32-float window
+      break;
+    }
+  }
+
+  int x = 0;
+
+  if (bDoGather)
+  {
+    // This 'auto' lambda construct replaces the need of templates
+    auto do_h_float_core = [&](auto partial_load) {
+      // Load up to 4x4 coefficients at once before the height loop.
+      // Pre-loading and transposing coefficients keeps register usage efficient.
+      // Assumes 'filter_size_aligned' is at least 4.
+
+      // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3])
+      __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src;
+
+      // Pixel offsets for the current target x-positions.
+      // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+      const int begin1 = program->pixel_offset[x + 0];
+      const int begin2 = program->pixel_offset[x + 1];
+      const int begin3 = program->pixel_offset[x + 2];
+      const int begin4 = program->pixel_offset[x + 3];
+      const int begin5 = program->pixel_offset[x + 4];
+      const int begin6 = program->pixel_offset[x + 5];
+      const int begin7 = program->pixel_offset[x + 6];
+      const int begin8 = program->pixel_offset[x + 7];
+      const int begin9 = program->pixel_offset[x + 8];
+      const int begin10 = program->pixel_offset[x + 9];
+      const int begin11 = program->pixel_offset[x + 10];
+      const int begin12 = program->pixel_offset[x + 11];
+      const int begin13 = program->pixel_offset[x + 12];
+      const int begin14 = program->pixel_offset[x + 13];
+      const int begin15 = program->pixel_offset[x + 14];
+      const int begin16 = program->pixel_offset[x + 15];
+
+      for (int y = 0; y < height; y++)
+      {
+        __m512 data_1_5_9_13;
+        __m512 data_2_6_10_14;
+        __m512 data_3_7_11_15;
+        __m512 data_4_8_12_16;
+
+        if constexpr (partial_load) {
+          // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+          // to prevent reading beyond the allocated source scanline.
+
+          data_1_5_9_13 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+        }
+        else {
+          // In the safe zone, we can directly load 4 pixels at a time for each of the four lanes.
+          data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+        }
+
+        _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16);
+
+        __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13);
+        result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result);
+        result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
+        result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
+
+        _mm512_store_ps(dst_ptr, result);
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      } // y
+      current_coeff += filter_size * 16; // Move to the next set of coefficients for the next 16 output pixels
+    }; // end of lambda
+
+    // Process the 'safe zone' where direct full unaligned loads are acceptable.
+    for (; x < width_safe_mod; x += PIXELS_AT_A_TIME)
+    {
+      do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128
+    }
+
+    // Process the potentially 'unsafe zone' near the image edge, using safe loading.
+    for (; x < width; x += PIXELS_AT_A_TIME)
+    {
+      do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm512_load_partial_safe_4_m128'
+    }
+  }
+  else // !bDoGather: permutex path, every group fits into 32 sequential source floats
+  {
+    for (; x < width; x += PIXELS_AT_A_TIME) // x is still 0 here
+    {
+      // prepare coefs in transposed V-form: after the transpose coef_rK holds tap K of all 16 output pixels
+      __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
+
+      // convert resampling program in H-form into permuting indexes for src transposition in V-form
+      const int iStart = program->pixel_offset[x + 0];
+
+      const __m512i perm_0 = _mm512_set_epi32(
+        program->pixel_offset[x + 15] - iStart,
+        program->pixel_offset[x + 14] - iStart,
+        program->pixel_offset[x + 13] - iStart,
+        program->pixel_offset[x + 12] - iStart,
+        program->pixel_offset[x + 11] - iStart,
+        program->pixel_offset[x + 10] - iStart,
+        program->pixel_offset[x + 9] - iStart,
+        program->pixel_offset[x + 8] - iStart,
+        program->pixel_offset[x + 7] - iStart,
+        program->pixel_offset[x + 6] - iStart,
+        program->pixel_offset[x + 5] - iStart,
+        program->pixel_offset[x + 4] - iStart,
+        program->pixel_offset[x + 3] - iStart,
+        program->pixel_offset[x + 2] - iStart,
+        program->pixel_offset[x + 1] - iStart,
+        0);
+
+      // Tap k of output pixel i reads src[pixel_offset[x + i] + k], so each tap's index set is the previous one
+      // shifted by exactly 1 (independent of the spacing between neighbouring pixel offsets).
+      const __m512i one_epi32 = _mm512_set1_epi32(1);
+      const __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+      const __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+      const __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src + iStart; // all permute offsets relative to this start offset
+
+      for (int y = 0; y < height; y++) // single row proc
+      {
+        __m512 data_src = _mm512_loadu_ps(src_ptr);
+        __m512 data_src2 = _mm512_loadu_ps(src_ptr + 16); // TODO(review): not always needed for upscale and can over-read the end of the source buffer - needs a limited (masked) load or special end-of-buffer processing
+
+        __m512 data_0 = _mm512_permutex2var_ps(data_src, perm_0, data_src2);
+        __m512 data_1 = _mm512_permutex2var_ps(data_src, perm_1, data_src2);
+        __m512 data_2 = _mm512_permutex2var_ps(data_src, perm_2, data_src2);
+        __m512 data_3 = _mm512_permutex2var_ps(data_src, perm_3, data_src2);
+
+        __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+        __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+        result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+        result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+        _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      }
+
+      current_coeff += filter_size * 16;
+    }
+  }
+}
+
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+
 #if 0 // DTL version
 // Transpose-based
 // process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2
diff --git a/avs_core/filters/intel/resample_avx512.h b/avs_core/filters/intel/resample_avx512.h
index 77117f2b3..affd410de 100644
--- a/avs_core/filters/intel/resample_avx512.h
+++ b/avs_core/filters/intel/resample_avx512.h
@@ -123,6 +123,10 @@ _mm512_insertf32x4(_mm512_insertf32x4(_mm512_insertf32x4(_mm512_castps128_ps512(
 template<int filtersizemod4>
 void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+
 void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);

From 810f91ba468601316f8ddf89adee3c9821eb9bfd Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sun, 25 May 2025 14:47:27 -0700
Subject: [PATCH 16/27] Added FilteredResize_2p

Example of using a single temp buffer for 2-pass H+V resizing of 3 or 4 planes, for lower memory usage and better cache reuse (for not very large frame/plane sizes). Still not very nice, but easy to control for testing with force=3.
---
 avs_core/filters/resample.cpp | 389 +++++++++++++++++++++++++++++++++-
 avs_core/filters/resample.h   |  46 ++++
 2 files changed, 428 insertions(+), 7 deletions(-)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index 52687d034..ca07becae 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -1967,15 +1967,24 @@ PClip FilteredResize::CreateResize(PClip clip, int target_width, int target_heig
   // 3 - force H and V
   const bool force_H = force == 1 || force == 3;
   const bool force_V = force == 2 || force == 3;
-  if (area_FirstH < area_FirstV)
-  {
-    result = CreateResizeV(clip, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env);
-    result = CreateResizeH(result, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env);
-  }
+
+  if (force == 3) // crude manual forcing of the special 2-pass mode; better to select it automatically when both H and V resizes are required. Currently for testing only.
+    result = new FilteredResize_2p(clip,
+      subrange_left, subrange_width, target_width,
+      subrange_top, subrange_height, target_height,
+      f, preserve_center, chroma_placement, env); 
   else
   {
-    result = CreateResizeH(clip, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env);
-    result = CreateResizeV(result, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env);
+    if (area_FirstH < area_FirstV)
+    {
+      result = CreateResizeV(clip, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env);
+      result = CreateResizeH(result, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env);
+    }
+    else
+    {
+      result = CreateResizeH(clip, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env);
+      result = CreateResizeV(result, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env);
+    }
   }
   return result;
 }
@@ -2153,3 +2162,369 @@ AVSValue __cdecl FilteredResize::Create_UserDefined2Resize(AVSValue args, void*,
   return CreateResize(args[0].AsClip(), args[1].AsInt(), args[2].AsInt(), &args[6], force, &f, preserve_center, placement_name, forced_chroma_placement, env);
 }
 
+
+/***************************************
+ *****    Filtered Resize - 2p    ******
+ ***************************************/
+// Constructor: validates the target size, builds the H and V resampling programs (luma, plus chroma for planar non-grey, non-RGB formats) and selects the matching resampler functions.
+FilteredResize_2p::FilteredResize_2p(PClip _child,
+  double subrange_left, double subrange_width, int target_width,
+  double subrange_top, double subrange_height, int target_height,
+  ResamplingFunction* func, bool preserve_center, int chroma_placement, IScriptEnvironment* env)
+  : GenericVideoFilter(_child),
+  resampling_program_luma_h(0), resampling_program_chroma_h(0),
+  resampling_program_luma_v(0), resampling_program_chroma_v(0)
+{
+  if (target_height <= 0)
+    env->ThrowError("Resize: Height must be greater than 0.");
+
+  if (target_width <= 0)
+    env->ThrowError("Resize: Width must be greater than 0.");
+
+  // remember source and target sizes; vi still describes the source here and is overwritten at the end
+  src_width = vi.width;
+  src_height = vi.height;
+  dst_width = target_width;
+  dst_height = target_height;
+
+  pixelsize = vi.ComponentSize(); // AVS16
+  bits_per_pixel = vi.BitsPerComponent();
+  grey = vi.IsY();
+  bool isRGBPfamily = vi.IsPlanarRGB() || vi.IsPlanarRGBA();
+
+  if (vi.IsPlanar() && !grey && !isRGBPfamily) {
+    const int mask = (1 << vi.GetPlaneHeightSubsampling(PLANAR_U)) - 1; // low bits that must be zero in target_height
+
+    if (target_height & mask)
+      env->ThrowError("Resize: Planar destination height must be a multiple of %d.", mask + 1);
+  }
+
+  if (vi.IsRGB() && !isRGBPfamily)
+    subrange_top = vi.height - subrange_top - subrange_height; // packed RGB upside down
+
+#ifdef INTEL_INTRINSICS
+  int cpu = env->GetCPUFlags();
+#else
+  int cpu = 0; // no CPU feature flags are passed to the resampler selection in this build
+#endif
+
+  double center_pos_v_luma;
+  double center_pos_v_chroma;
+  GetCenterShiftForResizers(center_pos_v_luma, center_pos_v_chroma, preserve_center, chroma_placement, vi, false /* for vertical */);
+
+  double center_pos_h_luma;
+  double center_pos_h_chroma;
+  GetCenterShiftForResizers(center_pos_h_luma, center_pos_h_chroma, preserve_center, chroma_placement, vi, true /* for horizontal */);
+  // 3.7.4- parameter, old Avisynth behavior: 0.5, 0.5
+
+  // Create the luma resampling program for H and select the matching H resampler function
+  resampling_program_luma_h = func->GetResamplingProgram(vi.width, subrange_left, subrange_width, target_width, bits_per_pixel,
+    center_pos_h_luma, center_pos_h_luma, // for resizing it's the same for source and dest
+    env);
+  resampler_luma_h = GetResamplerH(cpu, pixelsize, bits_per_pixel, resampling_program_luma_h, env);
+
+  // Create the luma resampling program for V and select the matching V resampler function
+  resampling_program_luma_v = func->GetResamplingProgram(vi.height, subrange_top, subrange_height, target_height, bits_per_pixel,
+    center_pos_v_luma, center_pos_v_luma, // for resizing it's the same for source and dest
+    env);
+  resampler_luma_v = GetResamplerV(cpu, pixelsize, bits_per_pixel, resampling_program_luma_v, env);
+
+
+  if (vi.IsPlanar() && !grey && !isRGBPfamily) { // chroma V program: sizes and subrange scaled by the height subsampling
+    const int shift = vi.GetPlaneHeightSubsampling(PLANAR_U);
+    const int div = 1 << shift;
+
+    resampling_program_chroma_v = func->GetResamplingProgram(
+      vi.height >> shift,
+      subrange_top / div,
+      subrange_height / div,
+      target_height >> shift,
+      bits_per_pixel,
+      center_pos_v_chroma, center_pos_v_chroma, // for resizing it's the same for source and dest
+      env);
+
+    resampler_chroma_v = GetResamplerV(cpu, pixelsize, bits_per_pixel, resampling_program_chroma_v, env);
+  }
+
+  if (vi.IsPlanar() && !grey && !isRGBPfamily) { // chroma H program: sizes and subrange scaled by the width subsampling
+    const int shift = vi.GetPlaneWidthSubsampling(PLANAR_U);
+    const int div = 1 << shift;
+
+    resampling_program_chroma_h = func->GetResamplingProgram(
+      vi.width >> shift,
+      subrange_left / div,
+      subrange_width / div,
+      target_width >> shift,
+      bits_per_pixel,
+      center_pos_h_chroma, center_pos_h_chroma, // horizontal
+      env);
+
+    resampler_chroma_h = GetResamplerH(cpu, pixelsize, bits_per_pixel, resampling_program_chroma_h, env);
+  }
+
+  // Change target video info size (from here on vi describes the output clip)
+  vi.height = target_height;
+  vi.width = target_width;
+}
+// Produces frame n: every plane is resized horizontally into one shared temp buffer (dst_width x src_height rows at the destination pitch) and then vertically from that buffer into the destination plane.
+PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env)
+{
+  PVideoFrame src = child->GetFrame(n, env);
+  PVideoFrame dst = env->NewVideoFrameP(vi, &src);
+  int src_pitch = src->GetPitch();
+  int dst_pitch = dst->GetPitch();
+  const BYTE* srcp = src->GetReadPtr();
+  BYTE* dstp = dst->GetWritePtr(); // for first (largest ?) plane or for single ?
+
+  bool isRGBPfamily = vi.IsPlanarRGB() || vi.IsPlanarRGBA();
+
+  BYTE* temp_1 = static_cast<BYTE*>(env->Allocate(static_cast<size_t>(dst_pitch) * src_height, FRAME_ALIGN, AVS_POOLED_ALLOC)); // the H pass writes src_height rows, not dst_height
+  if (!temp_1) {
+    // nothing to free: the allocation failed
+    env->ThrowError("Could not reserve temp memory in a resampler_2p.");
+  }
+
+  // Do resizing, single plane by plane
+  resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+  const int work_width = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // row width in samples for the V pass; packed RGB: or vi.width * vi.NumComponent()
+  resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_width, vi.height, bits_per_pixel);
+
+  if (isRGBPfamily)
+  {
+    src_pitch = src->GetPitch(PLANAR_B);
+    dst_pitch = dst->GetPitch(PLANAR_B);
+    srcp = src->GetReadPtr(PLANAR_B);
+    dstp = dst->GetWritePtr(PLANAR_B);
+
+    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    // planar RGB planes share the luma programs and the same work_width
+    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_width, vi.height, bits_per_pixel);
+
+    src_pitch = src->GetPitch(PLANAR_R);
+    dst_pitch = dst->GetPitch(PLANAR_R);
+    srcp = src->GetReadPtr(PLANAR_R);
+    dstp = dst->GetWritePtr(PLANAR_R);
+
+    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_width, vi.height, bits_per_pixel);
+
+  }
+  else if (!grey && vi.IsPlanar()) {
+    int width = vi.width >> vi.GetPlaneWidthSubsampling(PLANAR_U);
+    int height = vi.height >> vi.GetPlaneHeightSubsampling(PLANAR_U);
+
+    // Plane U resizing
+    src_pitch = src->GetPitch(PLANAR_U);
+    dst_pitch = dst->GetPitch(PLANAR_U);
+    srcp = src->GetReadPtr(PLANAR_U);
+    dstp = dst->GetWritePtr(PLANAR_U);
+
+    resampler_chroma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel);
+    resampler_chroma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
+
+    // Plane V resizing
+    src_pitch = src->GetPitch(PLANAR_V);
+    dst_pitch = dst->GetPitch(PLANAR_V);
+    srcp = src->GetReadPtr(PLANAR_V);
+    dstp = dst->GetWritePtr(PLANAR_V);
+
+    resampler_chroma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel);
+    resampler_chroma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
+
+  }
+
+  if (vi.IsYUVA() || vi.IsPlanarRGBA()) {
+    src_pitch = src->GetPitch(PLANAR_A);
+    dst_pitch = dst->GetPitch(PLANAR_A);
+    srcp = src->GetReadPtr(PLANAR_A);
+    dstp = dst->GetWritePtr(PLANAR_A);
+
+    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    // the alpha plane is full size: it uses the luma programs and the same work_width
+    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_width, vi.height, bits_per_pixel);
+  }
+
+  env->Free(temp_1);
+
+  return dst;
+}
+
+ResamplerV FilteredResize_2p::GetResamplerV(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env) // may be somehow call same method from FilteredResizeV class ?
+{
+
+  resize_prepare_coeffs(program, env, 8);
+  // for SIMD friendliness and more: consolidate the kernel_size vs filter_size at the end.
+  // See comments at FilteredResizeH::GetResampler
+
+  if (program->filter_size == 1) {
+    // Fast pointresize
+    switch (pixelsize) // AVS16
+    {
+    case 1: return resize_v_planar_pointresize<uint8_t>;
+    case 2: return resize_v_planar_pointresize<uint16_t>;
+    default: // case 4:
+      return resize_v_planar_pointresize<float>;
+    }
+  }
+  else {
+    // Other resizers
+    if (pixelsize == 1)
+    {
+#ifdef INTEL_INTRINSICS
+      if (CPU & CPUF_AVX2)
+        return resize_v_avx2_planar_uint8_t;
+      if (CPU & CPUF_SSE2)
+        return resize_v_sse2_planar;
+#ifdef X86_32
+      if (CPU & CPUF_MMX)
+        return resize_v_mmx_planar;
+#endif
+#endif
+      // C version
+      return resize_v_c_planar_uint8_16_t_auto_vectorized<uint8_t, true>;
+    }
+    else if (pixelsize == 2)
+    {
+#ifdef INTEL_INTRINSICS
+      if (CPU & CPUF_AVX2) {
+        if (bits_per_pixel < 16)
+          return resize_v_avx2_planar_uint16_t<true>;
+        else
+          return resize_v_avx2_planar_uint16_t<false>;
+      }
+      if (CPU & CPUF_SSE2) {
+        if (bits_per_pixel < 16)
+          return resize_v_sse2_planar_uint16_t<true>;
+        else
+          return resize_v_sse2_planar_uint16_t<false>;
+      }
+#endif
+      // C version
+      if (bits_per_pixel == 16)
+        return resize_v_c_planar_uint8_16_t_auto_vectorized<uint16_t, false>;
+      else
+        return resize_v_c_planar_uint8_16_t_auto_vectorized<uint16_t, true>;
+    }
+    else // pixelsize== 4
+    {
+#ifdef INTEL_INTRINSICS
+      if (CPU & CPUF_AVX2) {
+        return resize_v_avx2_planar_float;
+      }
+      if (CPU & CPUF_SSE2) {
+        return resize_v_sse2_planar_float;
+      }
+#endif
+      return resize_v_c_planar_float_auto_vectorized;
+    }
+  }
+}
+
+ResamplerH FilteredResize_2p::GetResamplerH(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env) // may be somehow call same method from FilteredResizeH class ?
+{
+  int simd_coeff_count_padding = 8;
+
+  // Both 8-bit and 16-bit SSSE3 and AVX2 horizontal resizers benefit from processing 16 pixels per cycle.
+  // Floats also use 32 bytes, but since 32/sizeof(float) = 8, processing 16 pixels is unnecessary.
+  // Even in C, the code is optimized to be vector-friendly.
+  if (pixelsize == 1 || pixelsize == 2)
+    simd_coeff_count_padding = 16;
+
+  // Not only does it prepare and pad for SIMD/vector code, but it also corrects, reorders, and equalizes coefficients 
+  // at the right and bottom ends, since we may have variable kernel sizes due to boundary conditions.
+  resize_prepare_coeffs(program, env, simd_coeff_count_padding);
+
+  if (pixelsize == 1)
+  {
+#ifdef INTEL_INTRINSICS
+    if (CPU & CPUF_AVX2) {
+      return resizer_h_avx2_generic_uint8_t;
+    }
+    if (CPU & CPUF_SSSE3) {
+      return resizer_h_ssse3_generic_uint8_16<uint8_t, true>;
+    }
+#endif
+    return resizer_h_c_generic_uint8_16_vectorized<uint8_t, true>;
+    //return resize_h_c_planar<uint8_t, 1>;
+  }
+  else if (pixelsize == 2) {
+#ifdef INTEL_INTRINSICS
+    if (CPU & CPUF_AVX2) {
+      if (bits_per_pixel < 16)
+        return resizer_h_avx2_generic_uint16_t<true>;
+      else
+        return resizer_h_avx2_generic_uint16_t<false>;
+    }
+    if (CPU & CPUF_SSSE3) {
+      if (bits_per_pixel < 16)
+        return resizer_h_ssse3_generic_uint8_16<uint16_t, true>;
+      else
+        return resizer_h_ssse3_generic_uint8_16<uint16_t, false>;
+    }
+#endif
+    if (bits_per_pixel == 16)
+      return resizer_h_c_generic_uint8_16_vectorized<uint16_t, false>;
+    // return resize_h_c_planar<uint16_t, 0>;
+    else
+      return resizer_h_c_generic_uint8_16_vectorized<uint16_t, true>;
+    // return resize_h_c_planar<uint16_t, 1>;
+  }
+  else { //if (pixelsize == 4)
+#ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+    if ((CPU & CPUF_AVX512F) && program->filter_size_real <= 4) {
+      //return resize_h_planar_float_avx2_permutex_vstripe_ks4;
+      switch (program->filter_size_real) {
+        /*      case 1: return resize_h_planar_float_avx512_transpose_vstripe_ks4<1>; break;
+              case 2: return resize_h_planar_float_avx512_transpose_vstripe_ks4<2>; break;
+              case 3: return resize_h_planar_float_avx512_transpose_vstripe_ks4<3>; break;
+              case 4: return resize_h_planar_float_avx512_transpose_vstripe_ks4<0>; break;*/
+      case 1: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<0>; break;
+      }
+    }
+#endif
+    if (CPU & CPUF_AVX2) {
+      //return resize_h_planar_float_avx2_permutex_vstripe_ks4;
+
+      switch (program->filter_size_real) {
+        /*      case 1: return resize_h_planar_float_avx_transpose_vstripe_ks4<1>; break;
+              case 2: return resize_h_planar_float_avx_transpose_vstripe_ks4<2>; break;
+              case 3: return resize_h_planar_float_avx_transpose_vstripe_ks4<3>; break;
+              case 4: return resize_h_planar_float_avx_transpose_vstripe_ks4<0>; break;*/
+      case 1: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<0>; break;
+      default: return resizer_h_avx2_generic_float;
+      }
+
+    }
+    if (CPU & CPUF_SSSE3) {
+      //      return resizer_h_ssse3_generic_float;
+      switch (program->filter_size_real) {
+      case 1: return resize_h_planar_float_sse_transpose_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_sse_transpose_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_sse_transpose_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_sse_transpose_vstripe_ks4<0>; break;
+      default: return resizer_h_ssse3_generic_float;
+      }
+    }
+#endif
+    return resize_h_c_planar<float, 0>;
+  }
+}
+
+
+FilteredResize_2p::~FilteredResize_2p(void)
+{
+  if (resampling_program_luma_h) { delete resampling_program_luma_h; }
+  if (resampling_program_chroma_h) { delete resampling_program_chroma_h; }
+
+  if (resampling_program_luma_v) { delete resampling_program_luma_v; }
+  if (resampling_program_chroma_v) { delete resampling_program_chroma_v; }
+
+
+}
diff --git a/avs_core/filters/resample.h b/avs_core/filters/resample.h
index 8afed8775..d6272ebee 100644
--- a/avs_core/filters/resample.h
+++ b/avs_core/filters/resample.h
@@ -121,6 +121,52 @@ class FilteredResizeV : public GenericVideoFilter
 };
 
 
+/**
+  * Class to resize in the dual directions using a specified sampling filter and lower size used temporal buffer
+  * Helper for resample functions
+ **/
+class FilteredResize_2p : public GenericVideoFilter
+{
+public:
+  FilteredResize_2p(PClip _child,
+    double subrange_left, double subrange_width, int target_width, 
+    double subrange_top, double subrange_height, int target_height,
+    ResamplingFunction* func, bool preserve_center, int chroma_placement, IScriptEnvironment* env);
+  virtual ~FilteredResize_2p(void);
+
+  PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) override;
+
+  int __stdcall SetCacheHints(int cachehints, int frame_range) override {
+    AVS_UNUSED(frame_range);
+    return cachehints == CACHE_GET_MTMODE ? MT_NICE_FILTER : 0;
+  }
+
+  static ResamplerH GetResamplerH(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env);
+  static ResamplerV GetResamplerV(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env);
+
+private:
+  bool grey;
+  int pixelsize; // AVS16
+  int bits_per_pixel;
+
+  int src_width, src_height, dst_width, dst_height;
+
+  ResamplingProgram* resampling_program_luma_h;
+  ResamplingProgram* resampling_program_chroma_h;
+
+  ResamplingProgram* resampling_program_luma_v;
+  ResamplingProgram* resampling_program_chroma_v;
+
+  ResamplerH resampler_luma_h;
+  ResamplerH resampler_chroma_h;
+
+  ResamplerV resampler_luma_v;
+  ResamplerV resampler_chroma_v;
+
+};
+
+
+
 /*** Resample factory methods ***/
 
 class FilteredResize

From d945014406a54d2294130baed7aed13fd9d4ad50 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sun, 25 May 2025 15:47:44 -0700
Subject: [PATCH 17/27] Added AVX512 in

V-resizers selection
---
 avs_core/filters/resample.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index ca07becae..40b4a3333 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -1846,6 +1846,11 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi
     else // pixelsize== 4
     {
 #ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if (CPU & CPUF_AVX512F) {
+        return resize_v_avx512_planar_float;
+      }
+#endif
       if (CPU & CPUF_AVX2) {
         return resize_v_avx2_planar_float;
       }
@@ -2408,6 +2413,11 @@ ResamplerV FilteredResize_2p::GetResamplerV(int CPU, int pixelsize, int bits_per
     else // pixelsize== 4
     {
 #ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if (CPU & CPUF_AVX512F) {
+        return resize_v_avx512_planar_float;
+      }
+#endif
       if (CPU & CPUF_AVX2) {
         return resize_v_avx2_planar_float;
       }

From 482eb5288013512189bc19d100a5a88bfe55f937 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Mon, 26 May 2025 06:28:09 -0700
Subject: [PATCH 18/27] Example of better

FilteredResize_2p::GetFrame() function using memory from the general AVS+ video frame cache. It is only an example for now, because there is currently no analysis of the downstream filter's frame buffer request; such analysis is needed to request a buffer of the same size/type and so maximize the probability that the same buffer is handed to the downstream filter for writing.
---
 avs_core/filters/resample.cpp | 94 ++++++++++++++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index 40b4a3333..428c7051d 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -2272,7 +2272,7 @@ FilteredResize_2p::FilteredResize_2p(PClip _child,
   vi.width = target_width;
 }
 
-PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env)
+PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env) // use env->Allocate() to get temp buf from other allocated memory - it is NOT returned to the memory pool for the NewVideoFrameP for the downstream filter to write to ?
 {
   PVideoFrame src = child->GetFrame(n, env);
   PVideoFrame dst = env->NewVideoFrameP(vi, &src);
@@ -2354,6 +2354,98 @@ PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env
   return dst;
 }
 
+#if 0
+PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env) // use NewVideoFrame as temp buf to return it in the vfb pool after exit this filter
+{
+  PVideoFrame src = child->GetFrame(n, env);
+  PVideoFrame dst = env->NewVideoFrameP(vi, &src);
+
+  PVideoFrame tmp = env->NewVideoFrameP(vi, &src);
+  /*
+  Here we need to ask ScriptEnvironment to look for output format of downstream filter ? So it is not trans-in-place filter we can request frame buffer larger and left
+  it unused after exiting this function. Only in this case there is a big probability the env->NewVideoFrameP(vi, &src); for downstream filter call will return this same virtual address buffer
+  to the downstream filter and it can be (at least partially) overwritten saving from useless downloading from CPU cache. It is new TODO idea for modification of ScriptEnvironment vfb memory management.
+  After this will be implemented - we can use such method of requesting temp buffer (frame) to use in 2pass resize.
+  If this is last filter in a chain - simply request lowest possible sized frame.
+  */
+
+  int src_pitch = src->GetPitch();
+  int dst_pitch = dst->GetPitch();
+  const BYTE* srcp = src->GetReadPtr();
+  BYTE* dstp = dst->GetWritePtr(); // for first (largest ?) plane or for single ?
+
+  bool isRGBPfamily = vi.IsPlanarRGB() || vi.IsPlanarRGBA();
+
+  const BYTE* tmp_srcp = tmp->GetReadPtr();
+  BYTE* tmp_dstp = tmp->GetWritePtr(); // for first (largest ?) plane or for single ?
+  int tmp_pitch = tmp->GetPitch();
+
+  // Do resizing, single plane by plane
+  resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+  int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
+  resampler_luma_v(dstp, tmp_srcp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+
+  /* Currently left non-changed from env-Allocate() method untill we can request immediately reusable to writing temp buffer by downstream filter
+  if (isRGBPfamily)
+  {
+    src_pitch = src->GetPitch(PLANAR_B);
+    dst_pitch = dst->GetPitch(PLANAR_B);
+    srcp = src->GetReadPtr(PLANAR_B);
+    dstp = dst->GetWritePtr(PLANAR_B);
+
+    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
+    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+
+    src_pitch = src->GetPitch(PLANAR_R);
+    dst_pitch = dst->GetPitch(PLANAR_R);
+    srcp = src->GetReadPtr(PLANAR_R);
+    dstp = dst->GetWritePtr(PLANAR_R);
+
+    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+
+  }
+  else if (!grey && vi.IsPlanar()) {
+    int width = vi.width >> vi.GetPlaneWidthSubsampling(PLANAR_U);
+    int height = vi.height >> vi.GetPlaneHeightSubsampling(PLANAR_U);
+
+    // Plane U resizing
+    src_pitch = src->GetPitch(PLANAR_U);
+    dst_pitch = dst->GetPitch(PLANAR_U);
+    srcp = src->GetReadPtr(PLANAR_U);
+    dstp = dst->GetWritePtr(PLANAR_U);
+
+    resampler_chroma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel);
+    resampler_chroma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
+
+    // Plane V resizing
+    src_pitch = src->GetPitch(PLANAR_V);
+    dst_pitch = dst->GetPitch(PLANAR_V);
+    srcp = src->GetReadPtr(PLANAR_V);
+    dstp = dst->GetWritePtr(PLANAR_V);
+
+    resampler_chroma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel);
+    resampler_chroma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
+
+  }
+
+  if (vi.IsYUVA() || vi.IsPlanarRGBA()) {
+    src_pitch = src->GetPitch(PLANAR_A);
+    dst_pitch = dst->GetPitch(PLANAR_A);
+    srcp = src->GetReadPtr(PLANAR_A);
+    dstp = dst->GetWritePtr(PLANAR_A);
+
+    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
+    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+  }
+  */
+
+  return dst;
+}
+#if 0
+
 ResamplerV FilteredResize_2p::GetResamplerV(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env) // may be somehow call same method from FilteredResizeV class ?
 {
 

From a0218bddda9bbaf887da834d9a269ac564e7edd2 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Fri, 30 May 2025 07:10:40 -0700
Subject: [PATCH 19/27] Finished function

FilteredResize_2p::GetFrame() now uses a temp buffer from the main vfb memory cache. As expected, the released buffer is at least sometimes returned as the new video frame that the downstream filter uses as its destination. How often this happens still needs to be investigated and improved (best request size? asking for the request size directly by scanning the filter graph nodes for the data sink filter?).
Performance test using this script:
BlankClip(1000000, 320,320, pixel_type="YUV444PS")
mul=2
LanczosResize(width*mul, height*mul, taps=2, force=3)

ConverttoRGB24()
Prefetch(6)

on an i5-9600K gives about
738 fps with env->Allocate/Free and 804 fps with env->NewVideoFrame()
---
 avs_core/filters/resample.cpp | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index 428c7051d..2856d397a 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -2272,6 +2272,7 @@ FilteredResize_2p::FilteredResize_2p(PClip _child,
   vi.width = target_width;
 }
 
+#if 0 // expected worse in performance - left for performance tests
 PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env) // use env->Allocate() to get temp buf from other allocated memory - it is NOT returned to the memory pool for the NewVideoFrameP for the downstream filter to write to ?
 {
   PVideoFrame src = child->GetFrame(n, env);
@@ -2353,20 +2354,24 @@ PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env
 
   return dst;
 }
+#endif
 
-#if 0
 PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env) // use NewVideoFrame as temp buf to return it in the vfb pool after exit this filter
 {
   PVideoFrame src = child->GetFrame(n, env);
   PVideoFrame dst = env->NewVideoFrameP(vi, &src);
 
-  PVideoFrame tmp = env->NewVideoFrameP(vi, &src);
+  PVideoFrame tmp = env->NewVideoFrame(vi); // no need frame properties copy, use as temporal buffer only and its refcount will be zeroed at function exit with object auto-release/destructor (PVideoFrame::~PVideoFrame() )
   /*
   Here we need to ask ScriptEnvironment to look for output format of downstream filter ? So it is not trans-in-place filter we can request frame buffer larger and left
   it unused after exiting this function. Only in this case there is a big probability the env->NewVideoFrameP(vi, &src); for downstream filter call will return this same virtual address buffer
   to the downstream filter and it can be (at least partially) overwritten saving from useless downloading from CPU cache. It is new TODO idea for modification of ScriptEnvironment vfb memory management.
   After this will be implemented - we can use such method of requesting temp buffer (frame) to use in 2pass resize.
   If this is last filter in a chain - simply request lowest possible sized frame.
+
+  Update 30.05.2025: The expected transfer of tmp buf address to downstream fiter dst frame sometime happens - but how frequently it happens in real scripts running - need to be discovered.
+
+  As env->Allocate/Free buffers are definitely worse (only good if downstream filter will request same temp buf for write) - this temp method expected to be faster (as first expectations).
   */
 
   int src_pitch = src->GetPitch();
@@ -2385,7 +2390,7 @@ PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env
   int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
   resampler_luma_v(dstp, tmp_srcp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
 
-  /* Currently left non-changed from env-Allocate() method untill we can request immediately reusable to writing temp buffer by downstream filter
+  
   if (isRGBPfamily)
   {
     src_pitch = src->GetPitch(PLANAR_B);
@@ -2393,17 +2398,17 @@ PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env
     srcp = src->GetReadPtr(PLANAR_B);
     dstp = dst->GetWritePtr(PLANAR_B);
 
-    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
     int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
-    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+    resampler_luma_v(dstp, tmp_srcp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
 
     src_pitch = src->GetPitch(PLANAR_R);
     dst_pitch = dst->GetPitch(PLANAR_R);
     srcp = src->GetReadPtr(PLANAR_R);
     dstp = dst->GetWritePtr(PLANAR_R);
 
-    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
-    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+    resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    resampler_luma_v(dstp, tmp_srcp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
 
   }
   else if (!grey && vi.IsPlanar()) {
@@ -2416,8 +2421,8 @@ PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env
     srcp = src->GetReadPtr(PLANAR_U);
     dstp = dst->GetWritePtr(PLANAR_U);
 
-    resampler_chroma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel);
-    resampler_chroma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
+    resampler_chroma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel);
+    resampler_chroma_v(dstp, tmp_dstp, dst_pitch, tmp_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
 
     // Plane V resizing
     src_pitch = src->GetPitch(PLANAR_V);
@@ -2425,8 +2430,8 @@ PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env
     srcp = src->GetReadPtr(PLANAR_V);
     dstp = dst->GetWritePtr(PLANAR_V);
 
-    resampler_chroma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel);
-    resampler_chroma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
+    resampler_chroma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_V), bits_per_pixel);
+    resampler_chroma_v(dstp, tmp_dstp, dst_pitch, tmp_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
 
   }
 
@@ -2436,15 +2441,14 @@ PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env
     srcp = src->GetReadPtr(PLANAR_A);
     dstp = dst->GetWritePtr(PLANAR_A);
 
-    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
     int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
-    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+    resampler_luma_v(dstp, tmp_dstp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
   }
-  */
 
   return dst;
 }
-#if 0
+
 
 ResamplerV FilteredResize_2p::GetResamplerV(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env) // may be somehow call same method from FilteredResizeV class ?
 {

From e174650cafe4fbeec23efc89ff482512fdc9c787 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sun, 8 Jun 2025 04:07:31 -0700
Subject: [PATCH 20/27] New H and V resampling functions

---
 avs_core/filters/resample.cpp | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index 2856d397a..7e3d778a4 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -1615,10 +1615,10 @@ ResamplerH FilteredResizeH::GetResampler(int CPU, int pixelsize, int bits_per_pi
       case 2: return resize_h_planar_float_avx512_transpose_vstripe_ks4<2>; break;
       case 3: return resize_h_planar_float_avx512_transpose_vstripe_ks4<3>; break;
       case 4: return resize_h_planar_float_avx512_transpose_vstripe_ks4<0>; break;*/
-      case 1: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<1>; break;
-      case 2: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<2>; break;
-      case 3: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<3>; break;
-      case 4: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<0>; break;
+      case 1: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<1>; break;
+      case 2: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<2>; break;
+      case 3: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<3>; break;
+      case 4: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<0>; break;
       }
     }
 #endif
@@ -1848,11 +1848,13 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi
 #ifdef INTEL_INTRINSICS
 #ifdef INTEL_INTRINSICS_AVX512
       if (CPU & CPUF_AVX512F) {
-        return resize_v_avx512_planar_float;
+//        return resize_v_avx512_planar_float;
+        return resize_v_avx512_planar_float_w_sr;
       }
 #endif
       if (CPU & CPUF_AVX2) {
-        return resize_v_avx2_planar_float;
+//        return resize_v_avx2_planar_float;
+        return resize_v_avx2_planar_float_w_sr;
       }
       if (CPU & CPUF_SSE2) {
         return resize_v_sse2_planar_float;
@@ -2511,7 +2513,8 @@ ResamplerV FilteredResize_2p::GetResamplerV(int CPU, int pixelsize, int bits_per
 #ifdef INTEL_INTRINSICS
 #ifdef INTEL_INTRINSICS_AVX512
       if (CPU & CPUF_AVX512F) {
-        return resize_v_avx512_planar_float;
+//        return resize_v_avx512_planar_float;
+        return resize_v_avx512_planar_float_w_sr;
       }
 #endif
       if (CPU & CPUF_AVX2) {
@@ -2585,10 +2588,10 @@ ResamplerH FilteredResize_2p::GetResamplerH(int CPU, int pixelsize, int bits_per
               case 2: return resize_h_planar_float_avx512_transpose_vstripe_ks4<2>; break;
               case 3: return resize_h_planar_float_avx512_transpose_vstripe_ks4<3>; break;
               case 4: return resize_h_planar_float_avx512_transpose_vstripe_ks4<0>; break;*/
-      case 1: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<1>; break;
-      case 2: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<2>; break;
-      case 3: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<3>; break;
-      case 4: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<0>; break;
+      case 1: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<1>; break;
+      case 2: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<2>; break;
+      case 3: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<3>; break;
+      case 4: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<0>; break;
       }
     }
 #endif

From 06eaf70276f50f3d70ed4c3a8b48f0d44a01a01a Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sun, 8 Jun 2025 04:09:17 -0700
Subject: [PATCH 21/27] Added wider

V float AVX2 and AVX512 resamplers, and also a dual-width (32 samples per loop iteration) AVX512 H-resampler.
---
 avs_core/filters/intel/resample_avx2.cpp   |  96 +++
 avs_core/filters/intel/resample_avx2.h     |   1 +
 avs_core/filters/intel/resample_avx512.cpp | 675 ++++++++++++++++++++-
 avs_core/filters/intel/resample_avx512.h   |   4 +
 4 files changed, 762 insertions(+), 14 deletions(-)

diff --git a/avs_core/filters/intel/resample_avx2.cpp b/avs_core/filters/intel/resample_avx2.cpp
index 96fa1533a..23b82db8f 100644
--- a/avs_core/filters/intel/resample_avx2.cpp
+++ b/avs_core/filters/intel/resample_avx2.cpp
@@ -987,6 +987,102 @@ void resize_v_avx2_planar_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int
   }
 }
 
+// Memory-transfer optimized vertical float resampler: the main loop produces 32 floats (4 x __m256) per iteration so
+// that reads and writes form longer linear streams, and results are written with non-temporal (streaming) stores.
+// Pitches are given in bytes. Whole 8-float groups are stored, so the last group of a row may extend past 'width'
+// into the row padding, and every dst row must be 32-byte aligned (a requirement of _mm256_stream_ps).
+void resize_v_avx2_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
+{
+  AVS_UNUSED(bits_per_pixel);
+
+  const int filter_size = program->filter_size; // stride between the coefficient rows of consecutive output lines
+  const float* AVS_RESTRICT current_coeff = program->pixel_coefficient_float;
+
+  const float* src = (const float*)src8;
+  float* AVS_RESTRICT dst = (float*)dst8;
+  dst_pitch = dst_pitch / sizeof(float); // from here on both pitches are counted in floats
+  src_pitch = src_pitch / sizeof(float);
+
+  const int kernel_size = program->filter_size_real; // number of taps actually applied (not the aligned size)
+  const int kernel_size_mod2 = (kernel_size / 2) * 2; // taps that the tail loop can process as row pairs
+  const bool notMod2 = kernel_size_mod2 < kernel_size; // true when one unpaired tap remains
+
+  const int width_mod32 = (width / 32) * 32; // columns covered by the 4 x 8-float main loop; 16 ymm registers in 64-bit mode should be enough
+
+  for (int y = 0; y < target_height; y++) {
+    const int offset = program->pixel_offset[y]; // first source row contributing to output row y
+    const float* src_ptr = src + offset * src_pitch;
+
+    for (int x = 0; x < width_mod32; x += 32) {
+      __m256 result_1 = _mm256_setzero_ps();
+      __m256 result_2 = _mm256_setzero_ps();
+      __m256 result_3 = _mm256_setzero_ps();
+      __m256 result_4 = _mm256_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // walks down the source rows of this column group
+      for (int i = 0; i < kernel_size; i++) {
+        // one coefficient per source row, shared by every pixel of that row
+        __m256 coeff = _mm256_set1_ps(current_coeff[i]);
+        // Unaligned loads on purpose: nothing in this function guarantees a 32-byte aligned source row, and an
+        // aligned load would fault on such input. NOTE(review): confirm whether callers can pass unaligned src.
+        __m256 src_1 = _mm256_loadu_ps(src2_ptr);
+        __m256 src_2 = _mm256_loadu_ps(src2_ptr + 8);
+        __m256 src_3 = _mm256_loadu_ps(src2_ptr + 16);
+        __m256 src_4 = _mm256_loadu_ps(src2_ptr + 24);
+
+        result_1 = _mm256_fmadd_ps(src_1, coeff, result_1);
+        result_2 = _mm256_fmadd_ps(src_2, coeff, result_2);
+        result_3 = _mm256_fmadd_ps(src_3, coeff, result_3);
+        result_4 = _mm256_fmadd_ps(src_4, coeff, result_4);
+
+        src2_ptr += src_pitch;
+      }
+
+      // Streaming stores were best in the raw compute performance test, but may not be best inside a filter chain; a store-mode (cached / non-cached) control parameter would be better.
+      _mm256_stream_ps(dst + x, result_1);
+      _mm256_stream_ps(dst + x + 8, result_2);
+      _mm256_stream_ps(dst + x + 16, result_3);
+      _mm256_stream_ps(dst + x + 24, result_4);
+    } // width_mod32
+
+    // Remaining columns, 8 floats (one __m256) at a time.
+    // No width_mod8 split is needed: per the original note the row alignment is at least 32 bytes, so the last store may pass 'width'.
+    for (int x = width_mod32; x < width; x += 8) {
+      __m256 result_single = _mm256_setzero_ps();
+      __m256 result_single_2 = _mm256_setzero_ps();
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x;
+
+      // Two source rows per step with two independent accumulators, so the FMA chains can overlap.
+      int i = 0;
+      for (; i < kernel_size_mod2; i += 2) {
+        __m256 coeff_even = _mm256_set1_ps(current_coeff[i]);
+        __m256 coeff_odd = _mm256_set1_ps(current_coeff[i + 1]);
+
+        __m256 src_even = _mm256_loadu_ps(src2_ptr);
+        __m256 src_odd = _mm256_loadu_ps(src2_ptr + src_pitch);
+
+        result_single = _mm256_fmadd_ps(src_even, coeff_even, result_single);
+        result_single_2 = _mm256_fmadd_ps(src_odd, coeff_odd, result_single_2);
+        src2_ptr += 2 * src_pitch;
+      }
+
+      result_single = _mm256_add_ps(result_single, result_single_2);
+
+      // Odd tap count: after the pair loop 'i' indexes the one remaining coefficient / source row.
+      if (notMod2) {
+        __m256 coeff = _mm256_set1_ps(current_coeff[i]);
+        __m256 src_val = _mm256_loadu_ps(src2_ptr);
+        result_single = _mm256_fmadd_ps(src_val, coeff, result_single);
+      }
+
+      _mm256_stream_ps(dst + x, result_single);
+    }
+
+    dst += dst_pitch;
+    current_coeff += filter_size;
+  }
+}
+
 // avx2 16bit
 template void resizer_h_avx2_generic_uint16_t<false>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 // avx2 10-14bit
diff --git a/avs_core/filters/intel/resample_avx2.h b/avs_core/filters/intel/resample_avx2.h
index d5b2baa5d..8f4b41625 100644
--- a/avs_core/filters/intel/resample_avx2.h
+++ b/avs_core/filters/intel/resample_avx2.h
@@ -51,6 +51,7 @@ template<bool lessthan16bit>
 void resize_v_avx2_planar_uint16_t(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 
 void resize_v_avx2_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+void resize_v_avx2_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel); // memory-transfer optimized variant: 32 floats per main-loop iteration, streaming stores
 
 void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp
index 15957a488..44507d23f 100644
--- a/avs_core/filters/intel/resample_avx512.cpp
+++ b/avs_core/filters/intel/resample_avx512.cpp
@@ -200,7 +200,7 @@ void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE*
       result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
       result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
 
-      _mm512_store_ps(dst_ptr, result);
+      _mm512_stream_ps(dst_ptr, result); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
 
       dst_ptr += dst_pitch;
       src_ptr += src_pitch;
@@ -227,6 +227,7 @@ template void resize_h_planar_float_avx512_transpose_vstripe_ks4<1>(BYTE* dst8,
 template void resize_h_planar_float_avx512_transpose_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 template void resize_h_planar_float_avx512_transpose_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
+
 /* Universal function supporting 2 ways of processing depending on the max offset of the source samples to read in the resampling program :
 1. For high upsampling ratios it uses low read (single 8 float source samples) and permute-transpose before V-fma
 2. For downsample and no-resize convolution - use each input sequence gathering by direct addressing
@@ -357,7 +358,7 @@ void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4(BYTE* dst8, const
         result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
         result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
 
-        _mm512_store_ps(dst_ptr, result);
+        _mm512_stream_ps(dst_ptr, result); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
 
         dst_ptr += dst_pitch;
         src_ptr += src_pitch;
@@ -436,7 +437,7 @@ void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4(BYTE* dst8, const
         result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
         result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
 
-        _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+        _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
 
         dst_ptr += dst_pitch;
         src_ptr += src_pitch;
@@ -453,6 +454,483 @@ template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<2>(BYTE*
 template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
 
+/* Dual-width variant (32 output pixels per main-loop iteration) of the universal function. It selects one of 2 ways of processing from the span of source samples that each group of 16 output pixels reads according to the resampling program:
+1. Narrow span (high upsampling ratios): read the span with two 16-float loads and permute-transpose it before V-fma
+2. Wider span (e.g. downsampling): gather each input sequence by direct addressing
+*/
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  assert(filtersizemod4 >= 0 && filtersizemod4 <= 3);
+
+  const int filter_size = program->filter_size; // aligned, practically the coeff table stride
+
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  const int width_mod32 = (width / 32) * 32; // Process by 2x 512it (2 x 16 floats) to make memory read/write linear streams longer,
+
+  constexpr int MAX_PIXELS_AT_A_TIME = 32; // Process sixteen pixels in parallel using AVX512 (4x4 using m128 lanes)
+  constexpr int PIXELS_AT_A_TIME = 16; // Process sixteen pixels in parallel using AVX512 (4x4 using m128 lanes)
+
+  // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width.
+  // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back
+  // from the target width, as we load 4 floats at once conceptually with our safe load.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / MAX_PIXELS_AT_A_TIME * MAX_PIXELS_AT_A_TIME;
+
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop.
+
+  // 'target_size_alignment' ensures we can safely access coefficients using offsets like
+  // 'filter_size * 7' when processing 8 H pixels at a time or
+  // 'filter_size * 15' when processing 16 H pixels at a time
+  assert(program->target_size_alignment >= 16); // Adjusted for 16 pixels
+  assert(FRAME_ALIGN >= 64); // Adjusted for 16 pixels AviSynth+ default
+
+  // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads.
+  assert(program->filter_size_alignment >= 4);
+
+  bool bDoGather = false;
+  // Analyse input resampling program to select method of processing
+  for (int x = 0; x < width - 16; x += 16) // -16 to save from vector overrread at program->pixel_offset[x + 15 + 3]; ?
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 15];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 1];
+    end_off = program->pixel_offset[x + 15 + 1];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 2];
+    end_off = program->pixel_offset[x + 15 + 2];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 3];
+    end_off = program->pixel_offset[x + 15 + 3];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+  }
+
+  int x = 0;
+
+  if (bDoGather) 
+  {
+    // This 'auto' lambda construct replaces the need of templates
+    auto do_h_float_core_16 = [&](auto partial_load) {
+      // Load up to 4x4 coefficients at once before the height loop.
+      // Pre-loading and transposing coefficients keeps register usage efficient.
+      // Assumes 'filter_size_aligned' is at least 4.
+
+      // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3])
+      __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src;
+
+      // Pixel offsets for the current target x-positions.
+      // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+      const int begin1 = program->pixel_offset[x + 0];
+      const int begin2 = program->pixel_offset[x + 1];
+      const int begin3 = program->pixel_offset[x + 2];
+      const int begin4 = program->pixel_offset[x + 3];
+      const int begin5 = program->pixel_offset[x + 4];
+      const int begin6 = program->pixel_offset[x + 5];
+      const int begin7 = program->pixel_offset[x + 6];
+      const int begin8 = program->pixel_offset[x + 7];
+      const int begin9 = program->pixel_offset[x + 8];
+      const int begin10 = program->pixel_offset[x + 9];
+      const int begin11 = program->pixel_offset[x + 10];
+      const int begin12 = program->pixel_offset[x + 11];
+      const int begin13 = program->pixel_offset[x + 12];
+      const int begin14 = program->pixel_offset[x + 13];
+      const int begin15 = program->pixel_offset[x + 14];
+      const int begin16 = program->pixel_offset[x + 15];
+
+      for (int y = 0; y < height; y++)
+      {
+        __m512 data_1_5_9_13;
+        __m512 data_2_6_10_14;
+        __m512 data_3_7_11_15;
+        __m512 data_4_8_12_16;
+
+        if constexpr (partial_load) {
+          // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+          // to prevent reading beyond the allocated source scanline.
+
+          data_1_5_9_13 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+        }
+        else {
+          // In the safe zone, we can directly load 4 pixels at a time for each of the four lanes.
+          data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+        }
+
+        _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16);
+
+        __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13);
+        result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result);
+        result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
+        result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
+
+        _mm512_stream_ps(dst_ptr, result); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      } // y
+      current_coeff += filter_size * 16; // Move to the next set of coefficients for the next 16 output pixels
+    }; // end of lambda_16
+
+    // This 'auto' lambda construct replaces the need of templates
+    auto do_h_float_core_32 = [&](auto partial_load) {
+      // Load up to 4x4 coefficients at once before the height loop.
+      // Pre-loading and transposing coefficients keeps register usage efficient.
+      // Assumes 'filter_size_aligned' is at least 4.
+
+      // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3])
+      __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16);
+
+      // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3])
+      __m512 coef_1_5_9_13_2 = _mm512_load_4_m128(current_coeff + filter_size * 16, current_coeff + filter_size * 20, current_coeff + filter_size * 24, current_coeff + filter_size * 28);
+      __m512 coef_2_6_10_14_2 = _mm512_load_4_m128(current_coeff + filter_size * 17, current_coeff + filter_size * 21, current_coeff + filter_size * 25, current_coeff + filter_size * 29);
+      __m512 coef_3_7_11_15_2 = _mm512_load_4_m128(current_coeff + filter_size * 18, current_coeff + filter_size * 22, current_coeff + filter_size * 26, current_coeff + filter_size * 30);
+      __m512 coef_4_8_12_16_2 = _mm512_load_4_m128(current_coeff + filter_size * 19, current_coeff + filter_size * 23, current_coeff + filter_size * 27, current_coeff + filter_size * 31);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13_2, coef_2_6_10_14_2, coef_3_7_11_15_2, coef_4_8_12_16_2);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src;
+
+      // Pixel offsets for the current target x-positions.
+      // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+      const int begin1 = program->pixel_offset[x + 0];
+      const int begin2 = program->pixel_offset[x + 1];
+      const int begin3 = program->pixel_offset[x + 2];
+      const int begin4 = program->pixel_offset[x + 3];
+      const int begin5 = program->pixel_offset[x + 4];
+      const int begin6 = program->pixel_offset[x + 5];
+      const int begin7 = program->pixel_offset[x + 6];
+      const int begin8 = program->pixel_offset[x + 7];
+      const int begin9 = program->pixel_offset[x + 8];
+      const int begin10 = program->pixel_offset[x + 9];
+      const int begin11 = program->pixel_offset[x + 10];
+      const int begin12 = program->pixel_offset[x + 11];
+      const int begin13 = program->pixel_offset[x + 12];
+      const int begin14 = program->pixel_offset[x + 13];
+      const int begin15 = program->pixel_offset[x + 14];
+      const int begin16 = program->pixel_offset[x + 15];
+
+      // Pixel offsets for the current target x-positions.
+      // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+      const int begin1_2 = program->pixel_offset[x + 16];
+      const int begin2_2 = program->pixel_offset[x + 17];
+      const int begin3_2 = program->pixel_offset[x + 18];
+      const int begin4_2 = program->pixel_offset[x + 19];
+      const int begin5_2 = program->pixel_offset[x + 20];
+      const int begin6_2 = program->pixel_offset[x + 21];
+      const int begin7_2 = program->pixel_offset[x + 22];
+      const int begin8_2 = program->pixel_offset[x + 23];
+      const int begin9_2 = program->pixel_offset[x + 24];
+      const int begin10_2 = program->pixel_offset[x + 25];
+      const int begin11_2 = program->pixel_offset[x + 26];
+      const int begin12_2 = program->pixel_offset[x + 27];
+      const int begin13_2 = program->pixel_offset[x + 28];
+      const int begin14_2 = program->pixel_offset[x + 29];
+      const int begin15_2 = program->pixel_offset[x + 30];
+      const int begin16_2 = program->pixel_offset[x + 31];
+
+      for (int y = 0; y < height; y++)
+      {
+        __m512 data_1_5_9_13;
+        __m512 data_2_6_10_14;
+        __m512 data_3_7_11_15;
+        __m512 data_4_8_12_16;
+
+        __m512 data_1_5_9_13_2;
+        __m512 data_2_6_10_14_2;
+        __m512 data_3_7_11_15_2;
+        __m512 data_4_8_12_16_2;
+
+        if constexpr (partial_load) {
+          // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+          // to prevent reading beyond the allocated source scanline.
+
+          data_1_5_9_13 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+
+          data_1_5_9_13_2 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin1_2, src_ptr + begin5_2, src_ptr + begin9_2, src_ptr + begin13_2);
+          data_2_6_10_14_2 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin2_2, src_ptr + begin6_2, src_ptr + begin10_2, src_ptr + begin14_2);
+          data_3_7_11_15_2 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin3_2, src_ptr + begin7_2, src_ptr + begin11_2, src_ptr + begin15_2);
+          data_4_8_12_16_2 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin4_2, src_ptr + begin8_2, src_ptr + begin12_2, src_ptr + begin16_2);
+
+        }
+        else {
+          // In the safe zone, we can directly load 4 pixels at a time for each of the four lanes.
+          data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+
+          data_1_5_9_13_2 = _mm512_loadu_4_m128(src_ptr + begin1_2, src_ptr + begin5_2, src_ptr + begin9_2, src_ptr + begin13_2);
+          data_2_6_10_14_2 = _mm512_loadu_4_m128(src_ptr + begin2_2, src_ptr + begin6_2, src_ptr + begin10_2, src_ptr + begin14_2);
+          data_3_7_11_15_2 = _mm512_loadu_4_m128(src_ptr + begin3_2, src_ptr + begin7_2, src_ptr + begin11_2, src_ptr + begin15_2);
+          data_4_8_12_16_2 = _mm512_loadu_4_m128(src_ptr + begin4_2, src_ptr + begin8_2, src_ptr + begin12_2, src_ptr + begin16_2);
+
+        }
+
+        _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16);
+        _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13_2, data_2_6_10_14_2, data_3_7_11_15_2, data_4_8_12_16_2);
+
+        __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13);
+        result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result);
+        result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
+        result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
+
+        __m512 result_2 = _mm512_mul_ps(data_1_5_9_13_2, coef_1_5_9_13_2);
+        result_2 = _mm512_fmadd_ps(data_2_6_10_14_2, coef_2_6_10_14_2, result_2);
+        result_2 = _mm512_fmadd_ps(data_3_7_11_15_2, coef_3_7_11_15_2, result_2);
+        result_2 = _mm512_fmadd_ps(data_4_8_12_16_2, coef_4_8_12_16_2, result_2);
+
+
+        _mm512_stream_ps(dst_ptr, result); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+        _mm512_stream_ps(dst_ptr + 16, result_2);
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      } // y
+      current_coeff += filter_size * 32; // Move to the next set of coefficients for the next 32 output pixels
+    }; // end of lambda
+
+    // Process the 'safe zone' where direct full unaligned loads are acceptable.
+    for (; x < std::min(width_mod32, width_safe_mod); x += 32)
+    {
+      do_h_float_core_32(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128
+    }
+
+    for (width_mod32; x < width_safe_mod; x += PIXELS_AT_A_TIME) 
+    {
+      do_h_float_core_16(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128
+    }
+
+    // Process the potentially 'unsafe zone' near the image edge, using safe loading.
+    for (; x < width; x += PIXELS_AT_A_TIME)
+    {
+      do_h_float_core_16(std::true_type{}); // partial_load == true, use the safer '_mm512_load_partial_safe_4_m128'
+    }
+  }
+  else // if(bDoGather)
+  {
+    for (int x = 0; x < width_mod32; x += 32)
+    {
+      // prepare coefs in transposed V-form
+      __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
+
+      __m512 coef_r0_2 = _mm512_load_4_m128(current_coeff + filter_size * 16, current_coeff + filter_size * 20, current_coeff + filter_size * 24, current_coeff + filter_size * 28);
+      __m512 coef_r1_2 = _mm512_load_4_m128(current_coeff + filter_size * 17, current_coeff + filter_size * 21, current_coeff + filter_size * 25, current_coeff + filter_size * 29);
+      __m512 coef_r2_2 = _mm512_load_4_m128(current_coeff + filter_size * 18, current_coeff + filter_size * 22, current_coeff + filter_size * 26, current_coeff + filter_size * 30);
+      __m512 coef_r3_2 = _mm512_load_4_m128(current_coeff + filter_size * 19, current_coeff + filter_size * 23, current_coeff + filter_size * 27, current_coeff + filter_size * 31);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_r0_2, coef_r1_2, coef_r2_2, coef_r3_2);
+
+      // convert resampling program in H-form into permuting indexes for src transposition in V-form
+      int iStart = program->pixel_offset[x + 0];
+
+      __m512i perm_0 = _mm512_set_epi32(
+        program->pixel_offset[x + 15] - iStart,
+        program->pixel_offset[x + 14] - iStart,
+        program->pixel_offset[x + 13] - iStart,
+        program->pixel_offset[x + 12] - iStart,
+        program->pixel_offset[x + 11] - iStart,
+        program->pixel_offset[x + 10] - iStart,
+        program->pixel_offset[x + 9] - iStart,
+        program->pixel_offset[x + 8] - iStart,
+        program->pixel_offset[x + 7] - iStart,
+        program->pixel_offset[x + 6] - iStart,
+        program->pixel_offset[x + 5] - iStart,
+        program->pixel_offset[x + 4] - iStart,
+        program->pixel_offset[x + 3] - iStart,
+        program->pixel_offset[x + 2] - iStart,
+        program->pixel_offset[x + 1] - iStart,
+        0);
+
+      __m512i one_epi32 = _mm512_set1_epi32(1);
+      __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+      __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+      __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+
+      // second gropup
+      __m512i perm_0_2 = _mm512_set_epi32(
+        program->pixel_offset[x + 31] - iStart,
+        program->pixel_offset[x + 30] - iStart,
+        program->pixel_offset[x + 29] - iStart,
+        program->pixel_offset[x + 28] - iStart,
+        program->pixel_offset[x + 27] - iStart,
+        program->pixel_offset[x + 26] - iStart,
+        program->pixel_offset[x + 25] - iStart,
+        program->pixel_offset[x + 24] - iStart,
+        program->pixel_offset[x + 23] - iStart,
+        program->pixel_offset[x + 22] - iStart,
+        program->pixel_offset[x + 21] - iStart,
+        program->pixel_offset[x + 20] - iStart,
+        program->pixel_offset[x + 19] - iStart,
+        program->pixel_offset[x + 18] - iStart,
+        program->pixel_offset[x + 17] - iStart,
+        program->pixel_offset[x + 16] - iStart);
+
+
+      __m512i perm_1_2 = _mm512_add_epi32(perm_0_2, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+      __m512i perm_2_2 = _mm512_add_epi32(perm_1_2, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+      __m512i perm_3_2 = _mm512_add_epi32(perm_2_2, one_epi32);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+      const float* src_ptr2 = src + program->pixel_offset[x + 16]; // all permute offsets relative to this start offset
+
+      for (int y = 0; y < height; y++) // single row proc
+      {
+        __m512 data_src = _mm512_loadu_ps(src_ptr);
+        __m512 data_src2 = _mm512_loadu_ps(src_ptr + 16); // not always needed for upscale also can cause end of buffer overread - need to add limitation (special end of buffer processing ?)
+
+        __m512 data_src_2 = _mm512_loadu_ps(src_ptr2);
+        __m512 data_src2_2 = _mm512_loadu_ps(src_ptr2 + 16); // not always needed for upscale also can cause end of buffer overread - need to add limitation (special end of buffer processing ?)
+
+        __m512 data_0 = _mm512_permutex2var_ps(data_src, perm_0, data_src2);
+        __m512 data_1 = _mm512_permutex2var_ps(data_src, perm_1, data_src2);
+        __m512 data_2 = _mm512_permutex2var_ps(data_src, perm_2, data_src2);
+        __m512 data_3 = _mm512_permutex2var_ps(data_src, perm_3, data_src2);
+
+        __m512 data_0_2 = _mm512_permutex2var_ps(data_src_2, perm_0_2, data_src2_2);
+        __m512 data_1_2 = _mm512_permutex2var_ps(data_src_2, perm_1_2, data_src2_2);
+        __m512 data_2_2 = _mm512_permutex2var_ps(data_src_2, perm_2_2, data_src2_2);
+        __m512 data_3_2 = _mm512_permutex2var_ps(data_src_2, perm_3_2, data_src2_2);
+
+        __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+        __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+        __m512 result0_2 = _mm512_mul_ps(data_0_2, coef_r0_2);
+        __m512 result1_2 = _mm512_mul_ps(data_2_2, coef_r2_2);
+
+        result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+        result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+        result0_2 = _mm512_fmadd_ps(data_1_2, coef_r1_2, result0_2);
+        result1_2 = _mm512_fmadd_ps(data_3_2, coef_r3_2, result1_2);
+
+
+        _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+        _mm512_stream_ps(dst_ptr + 16, _mm512_add_ps(result0_2, result1_2)); 
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      }
+
+      current_coeff += filter_size * 32;
+    } // to width_mod32
+
+    for (int x = width_mod32; x < width; x += 16)
+    {
+      // prepare coefs in transposed V-form
+      __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
+
+      // convert resampling program in H-form into permuting indexes for src transposition in V-form
+      int iStart = program->pixel_offset[x + 0];
+
+      __m512i perm_0 = _mm512_set_epi32(
+        program->pixel_offset[x + 15] - iStart,
+        program->pixel_offset[x + 14] - iStart,
+        program->pixel_offset[x + 13] - iStart,
+        program->pixel_offset[x + 12] - iStart,
+        program->pixel_offset[x + 11] - iStart,
+        program->pixel_offset[x + 10] - iStart,
+        program->pixel_offset[x + 9] - iStart,
+        program->pixel_offset[x + 8] - iStart,
+        program->pixel_offset[x + 7] - iStart,
+        program->pixel_offset[x + 6] - iStart,
+        program->pixel_offset[x + 5] - iStart,
+        program->pixel_offset[x + 4] - iStart,
+        program->pixel_offset[x + 3] - iStart,
+        program->pixel_offset[x + 2] - iStart,
+        program->pixel_offset[x + 1] - iStart,
+        0);
+
+      __m512i one_epi32 = _mm512_set1_epi32(1);
+      __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+      __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+      __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+      for (int y = 0; y < height; y++) // single row proc
+      {
+        __m512 data_src = _mm512_loadu_ps(src_ptr);
+        __m512 data_src2 = _mm512_loadu_ps(src_ptr + 16); // not always needed for upscale also can cause end of buffer overread - need to add limitation (special end of buffer processing ?)
+
+        __m512 data_0 = _mm512_permutex2var_ps(data_src, perm_0, data_src2);
+        __m512 data_1 = _mm512_permutex2var_ps(data_src, perm_1, data_src2);
+        __m512 data_2 = _mm512_permutex2var_ps(data_src, perm_2, data_src2);
+        __m512 data_3 = _mm512_permutex2var_ps(data_src, perm_3, data_src2);
+
+        __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+        __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+        result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+        result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+        _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      }
+
+      current_coeff += filter_size * 16;
+    } // to width
+  }
+}
+
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel); // explicit instantiations for filtersizemod4 = 0, 1, 2 and 3 (this line and the next three); the header only declares the template
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+
 #if 0 // DTL version
 // Transpose-based
 // process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2
@@ -496,7 +974,7 @@ void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE*
             result = _mm512_fmadd_ps(d3_d7_d11_d15, c3_c7_c11_c15, result);
             result = _mm512_fmadd_ps(d4_d8_d12_d16, c4_c8_c12_c16, result);
 
-            _mm512_store_ps(dst_ptr, result);
+            _mm512_stream_ps(dst_ptr, result); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
 
             dst_ptr += dst_pitch;
             src_ptr += src_pitch;
@@ -572,7 +1050,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
       result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
 
-      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+      _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
 
       dst_ptr += dst_pitch;
       src_ptr += src_pitch;
@@ -608,8 +1086,8 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
       result1 = _mm512_fmadd_ps(data_3_2, coef_r3, result1);
 
-      _mm512_store_ps(dst_ptr, result0);
-      _mm512_store_ps(dst_ptr + dst_pitch, result1);
+      _mm512_stream_ps(dst_ptr, result0); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+      _mm512_stream_ps(dst_ptr + dst_pitch, result1);
 
       dst_ptr += dst_pitch * 2;
       src_ptr += src_pitch * 2;
@@ -630,7 +1108,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
       result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
 
-      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+      _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
     }
 
     current_coeff += filter_size * 16;
@@ -734,7 +1212,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
       result1 = _mm512_fmadd_ps(data_7, coef_r7, result1);
 
-      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+      _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
 
       dst_ptr += dst_pitch;
       src_ptr += src_pitch;
@@ -790,8 +1268,8 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_7, coef_r7, result0);
       result1 = _mm512_fmadd_ps(data_7_2, coef_r7, result1);
 
-      _mm512_store_ps(dst_ptr, result0);
-      _mm512_store_ps(dst_ptr + dst_pitch, result1);
+      _mm512_stream_ps(dst_ptr, result0); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+      _mm512_stream_ps(dst_ptr + dst_pitch, result1);
 
       dst_ptr += dst_pitch * 2;
       src_ptr += src_pitch * 2;
@@ -822,7 +1300,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
       result1 = _mm512_fmadd_ps(data_7, coef_r7, result1);
 
-      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+      _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
     }
 
     current_coeff += filter_size * 16;
@@ -978,7 +1456,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE*
       result0 = _mm512_fmadd_ps(data_7, coef_r7, result0);
       result1 = _mm512_fmadd_ps(data_15, coef_r15, result1);
 
-      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1));
+      _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
 
       dst_ptr += dst_pitch;
       src_ptr += src_pitch;
@@ -1046,11 +1524,180 @@ void resize_v_avx512_planar_float(BYTE* dst8, const BYTE* src8, int dst_pitch, i
         result_single = _mm512_fmadd_ps(src_val, coeff, result_single);
       }
 
-      _mm512_store_ps(dst + x, result_single);
+      _mm512_stream_ps(dst + x, result_single);
+    }
+
+    dst += dst_pitch;
+    current_coeff += filter_size;
+  }
+}
+
+void resize_v_avx512_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
+{
+  AVS_UNUSED(bits_per_pixel);
+  // Vertical resampler for float samples: output row y is the sum over i < kernel_size of current_coeff[i] * source row (program->pixel_offset[y] + i); columns are handled in 128-, 64-, 32- and finally 16-float steps.
+  const int filter_size = program->filter_size; // stride of the coefficient table: current_coeff advances by this much per output row
+  const float* AVS_RESTRICT current_coeff = program->pixel_coefficient_float;
+
+  const float* src = (const float*)src8;
+  float* AVS_RESTRICT dst = (float*)dst8;
+  dst_pitch = dst_pitch / sizeof(float); // from here on pitches are counted in floats (presumably passed in bytes - confirm with caller)
+  src_pitch = src_pitch / sizeof(float);
+
+  const int kernel_size = program->filter_size_real; // number of taps actually accumulated per output row, not the aligned size
+  const int kernel_size_mod2 = (kernel_size / 2) * 2; // Process pairs of rows for better efficiency (used by the 16-float remainder loop only)
+  const bool notMod2 = kernel_size_mod2 < kernel_size; // true when one unpaired tap is left after the paired loop
+
+  const int width_mod128 = (width / 128) * 128; // Process by 8x 512 bit (8 x 16 floats) to make memory read/write linear streams longer, 32x512 bit registers should be enough
+  const int width_mod64 = (width / 64) * 64; // Process by 4x 512 bit (4 x 16 floats) to make memory read/write linear streams longer
+  const int width_mod32 = (width / 32) * 32; // Process by 2x 512 bit (2 x 16 floats) to make memory read/write linear streams longer
+
+  for (int y = 0; y < target_height; y++) {
+    int offset = program->pixel_offset[y]; // first source row that contributes to output row y
+    const float* src_ptr = src + offset * src_pitch;
+
+    for (int x = 0; x < width_mod128; x += 128) { // 8 accumulators = 128 output floats per iteration
+      __m512 result_1 = _mm512_setzero_ps();
+      __m512 result_2 = _mm512_setzero_ps();
+      __m512 result_3 = _mm512_setzero_ps();
+      __m512 result_4 = _mm512_setzero_ps();
+      __m512 result_5 = _mm512_setzero_ps();
+      __m512 result_6 = _mm512_setzero_ps();
+      __m512 result_7 = _mm512_setzero_ps();
+      __m512 result_8 = _mm512_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      int i = 0;
+      for (; i < kernel_size; i ++) { // one tap (one source row) per iteration
+        __m512 coeff = _mm512_set1_ps(current_coeff[i]); // broadcast the tap weight to all 16 lanes
+
+        __m512 src_1 = _mm512_load_ps(src2_ptr); // aligned loads: src2_ptr has to be 64-byte aligned
+        __m512 src_2 = _mm512_load_ps(src2_ptr + 16);
+        __m512 src_3 = _mm512_load_ps(src2_ptr + 32);
+        __m512 src_4 = _mm512_load_ps(src2_ptr + 48);
+        __m512 src_5 = _mm512_load_ps(src2_ptr + 64);
+        __m512 src_6 = _mm512_load_ps(src2_ptr + 80);
+        __m512 src_7 = _mm512_load_ps(src2_ptr + 96);
+        __m512 src_8 = _mm512_load_ps(src2_ptr + 112);
+
+        result_1 = _mm512_fmadd_ps(src_1, coeff, result_1);
+        result_2 = _mm512_fmadd_ps(src_2, coeff, result_2);
+        result_3 = _mm512_fmadd_ps(src_3, coeff, result_3);
+        result_4 = _mm512_fmadd_ps(src_4, coeff, result_4);
+        result_5 = _mm512_fmadd_ps(src_5, coeff, result_5);
+        result_6 = _mm512_fmadd_ps(src_6, coeff, result_6);
+        result_7 = _mm512_fmadd_ps(src_7, coeff, result_7);
+        result_8 = _mm512_fmadd_ps(src_8, coeff, result_8);
+
+        src2_ptr += src_pitch;
+      }
+
+      _mm512_stream_ps(dst + x, result_1); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+      _mm512_stream_ps(dst + x + 16, result_2);
+      _mm512_stream_ps(dst + x + 32, result_3);
+      _mm512_stream_ps(dst + x + 48, result_4);
+      _mm512_stream_ps(dst + x + 64, result_5);
+      _mm512_stream_ps(dst + x + 80, result_6);
+      _mm512_stream_ps(dst + x + 96, result_7);
+      _mm512_stream_ps(dst + x + 112, result_8);
+    }
+
+    for (int x = width_mod128; x < width_mod64; x += 64) {
+      __m512 result_1 = _mm512_setzero_ps(); // same scheme as the 128-float loop, with 4 accumulators
+      __m512 result_2 = _mm512_setzero_ps();
+      __m512 result_3 = _mm512_setzero_ps();
+      __m512 result_4 = _mm512_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      int i = 0;
+      for (; i < kernel_size; i++) {
+        __m512 coeff = _mm512_set1_ps(current_coeff[i]);
+
+        __m512 src_1 = _mm512_load_ps(src2_ptr);
+        __m512 src_2 = _mm512_load_ps(src2_ptr + 16);
+        __m512 src_3 = _mm512_load_ps(src2_ptr + 32);
+        __m512 src_4 = _mm512_load_ps(src2_ptr + 48);
+
+        result_1 = _mm512_fmadd_ps(src_1, coeff, result_1);
+        result_2 = _mm512_fmadd_ps(src_2, coeff, result_2);
+        result_3 = _mm512_fmadd_ps(src_3, coeff, result_3);
+        result_4 = _mm512_fmadd_ps(src_4, coeff, result_4);
+
+        src2_ptr += src_pitch;
+      }
+
+      _mm512_stream_ps(dst + x, result_1);
+      _mm512_stream_ps(dst + x + 16, result_2);
+      _mm512_stream_ps(dst + x + 32, result_3);
+      _mm512_stream_ps(dst + x + 48, result_4);
+    }
+
+    for (int x = width_mod64; x < width_mod32; x += 32) {
+      __m512 result_1 = _mm512_setzero_ps(); // same scheme again, with 2 accumulators
+      __m512 result_2 = _mm512_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      int i = 0;
+      for (; i < kernel_size; i++) {
+        __m512 coeff = _mm512_set1_ps(current_coeff[i]);
+
+        __m512 src_1 = _mm512_load_ps(src2_ptr);
+        __m512 src_2 = _mm512_load_ps(src2_ptr + 16);
+
+        result_1 = _mm512_fmadd_ps(src_1, coeff, result_1);
+        result_2 = _mm512_fmadd_ps(src_2, coeff, result_2);
+
+        src2_ptr += src_pitch;
+      }
+
+      _mm512_stream_ps(dst + x, result_1);
+      _mm512_stream_ps(dst + x + 16, result_2);
+    }
+
+
+    // Remainder columns: one 64-byte vector = 16 floats per step (an AVX512 register holds 16 floats)
+    // NOTE(review): when width is not a multiple of 16 the last step still loads and stores a full vector past width; this relies on rows being 64-byte aligned/padded - is it safe to load mod16 of float32? TODO confirm
+    for (int x = width_mod32; x < width; x += 16) {
+      __m512 result_single = _mm512_setzero_ps();
+      __m512 result_single_2 = _mm512_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      // Process pairs of rows for better efficiency (2 coeffs/cycle)
+      // two result variables (even taps / odd taps) for potential parallel operation; they are summed after the loop
+      int i = 0;
+      for (; i < kernel_size_mod2; i += 2) {
+        __m512 coeff_even = _mm512_set1_ps(current_coeff[i]);
+        __m512 coeff_odd = _mm512_set1_ps(current_coeff[i + 1]);
+
+        __m512 src_even = _mm512_load_ps(src2_ptr);
+        __m512 src_odd = _mm512_load_ps(src2_ptr + src_pitch);
+
+        result_single = _mm512_fmadd_ps(src_even, coeff_even, result_single);
+        result_single_2 = _mm512_fmadd_ps(src_odd, coeff_odd, result_single_2);
+
+        src2_ptr += 2 * src_pitch;
+      }
+
+      result_single = _mm512_add_ps(result_single, result_single_2);
+
+      // Process the last odd row if needed (i == kernel_size_mod2 here, i.e. the final tap)
+      if (notMod2) {
+        __m512 coeff = _mm512_set1_ps(current_coeff[i]);
+        __m512 src_val = _mm512_loadu_ps(src2_ptr); // unaligned load here, unlike the aligned loads in the paired loop
+        result_single = _mm512_fmadd_ps(src_val, coeff, result_single);
+      }
+
+      _mm512_stream_ps(dst + x, result_single);
     }
 
+
     dst += dst_pitch;
     current_coeff += filter_size;
   }
 }
 
+
diff --git a/avs_core/filters/intel/resample_avx512.h b/avs_core/filters/intel/resample_avx512.h
index affd410de..2d7fab5b0 100644
--- a/avs_core/filters/intel/resample_avx512.h
+++ b/avs_core/filters/intel/resample_avx512.h
@@ -126,11 +126,15 @@ void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE*
 template<int filtersizemod4>
 void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
 
 void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
 void resize_v_avx512_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+void resize_v_avx512_planar_float_w_sr(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 
 #endif // __Resample_AVX512_H__

From d24b9ebd5526fdd2f13f69d521be680585ee4cfd Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sun, 8 Jun 2025 06:52:27 -0700
Subject: [PATCH 22/27] Use stores instead of

stream (uncached) in new AVX2 and AVX512 float resizers.
---
 avs_core/filters/intel/resample_avx2.cpp   | 10 ++--
 avs_core/filters/intel/resample_avx512.cpp | 70 +++++++++++-----------
 2 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/avs_core/filters/intel/resample_avx2.cpp b/avs_core/filters/intel/resample_avx2.cpp
index 23b82db8f..b28c74b24 100644
--- a/avs_core/filters/intel/resample_avx2.cpp
+++ b/avs_core/filters/intel/resample_avx2.cpp
@@ -1036,10 +1036,10 @@ void resize_v_avx2_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch
         src2_ptr += src_pitch;
       }
 
-      _mm256_stream_ps(dst + x, result_1); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
-      _mm256_stream_ps(dst + x + 8, result_2);
-      _mm256_stream_ps(dst + x + 16, result_3);
-      _mm256_stream_ps(dst + x + 24, result_4);
+      _mm256_store_ps(dst + x, result_1); 
+      _mm256_store_ps(dst + x + 8, result_2);
+      _mm256_store_ps(dst + x + 16, result_3);
+      _mm256_store_ps(dst + x + 24, result_4);
     } // width_mod32
 
     // 32 byte 8 floats (AVX2 register holds 8 floats)
@@ -1075,7 +1075,7 @@ void resize_v_avx2_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch
         result_single = _mm256_fmadd_ps(src_val, coeff, result_single);
       }
 
-      _mm256_stream_ps(dst + x, result_single);
+      _mm256_store_ps(dst + x, result_single);
     }
 
     dst += dst_pitch;
diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp
index 44507d23f..531d0a134 100644
--- a/avs_core/filters/intel/resample_avx512.cpp
+++ b/avs_core/filters/intel/resample_avx512.cpp
@@ -200,7 +200,7 @@ void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE*
       result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
       result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
 
-      _mm512_stream_ps(dst_ptr, result); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+      _mm512_store_ps(dst_ptr, result); 
 
       dst_ptr += dst_pitch;
       src_ptr += src_pitch;
@@ -358,7 +358,7 @@ void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4(BYTE* dst8, const
         result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
         result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
 
-        _mm512_stream_ps(dst_ptr, result); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+        _mm512_store_ps(dst_ptr, result); 
 
         dst_ptr += dst_pitch;
         src_ptr += src_pitch;
@@ -437,7 +437,7 @@ void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4(BYTE* dst8, const
         result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
         result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
 
-        _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+        _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
 
         dst_ptr += dst_pitch;
         src_ptr += src_pitch;
@@ -587,7 +587,7 @@ void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w(BYTE* dst8, con
         result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
         result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
 
-        _mm512_stream_ps(dst_ptr, result); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+        _mm512_store_ps(dst_ptr, result); 
 
         dst_ptr += dst_pitch;
         src_ptr += src_pitch;
@@ -713,8 +713,8 @@ void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w(BYTE* dst8, con
         result_2 = _mm512_fmadd_ps(data_4_8_12_16_2, coef_4_8_12_16_2, result_2);
 
 
-        _mm512_stream_ps(dst_ptr, result); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
-        _mm512_stream_ps(dst_ptr + 16, result_2);
+        _mm512_store_ps(dst_ptr, result); 
+        _mm512_store_ps(dst_ptr + 16, result_2);
 
         dst_ptr += dst_pitch;
         src_ptr += src_pitch;
@@ -847,8 +847,8 @@ void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w(BYTE* dst8, con
         result1_2 = _mm512_fmadd_ps(data_3_2, coef_r3_2, result1_2);
 
 
-        _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
-        _mm512_stream_ps(dst_ptr + 16, _mm512_add_ps(result0_2, result1_2)); 
+        _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
+        _mm512_store_ps(dst_ptr + 16, _mm512_add_ps(result0_2, result1_2)); 
 
         dst_ptr += dst_pitch;
         src_ptr += src_pitch;
@@ -914,7 +914,7 @@ void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w(BYTE* dst8, con
         result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
         result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
 
-        _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+        _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
 
         dst_ptr += dst_pitch;
         src_ptr += src_pitch;
@@ -974,7 +974,7 @@ void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE*
             result = _mm512_fmadd_ps(d3_d7_d11_d15, c3_c7_c11_c15, result);
             result = _mm512_fmadd_ps(d4_d8_d12_d16, c4_c8_c12_c16, result);
 
-            _mm512_stream_ps(dst_ptr, result); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+            _mm512_store_ps(dst_ptr, result); 
 
             dst_ptr += dst_pitch;
             src_ptr += src_pitch;
@@ -1050,7 +1050,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
       result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
 
-      _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
 
       dst_ptr += dst_pitch;
       src_ptr += src_pitch;
@@ -1086,8 +1086,8 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
       result1 = _mm512_fmadd_ps(data_3_2, coef_r3, result1);
 
-      _mm512_stream_ps(dst_ptr, result0); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
-      _mm512_stream_ps(dst_ptr + dst_pitch, result1);
+      _mm512_store_ps(dst_ptr, result0); 
+      _mm512_store_ps(dst_ptr + dst_pitch, result1);
 
       dst_ptr += dst_pitch * 2;
       src_ptr += src_pitch * 2;
@@ -1108,7 +1108,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
       result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
 
-      _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
     }
 
     current_coeff += filter_size * 16;
@@ -1212,7 +1212,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
       result1 = _mm512_fmadd_ps(data_7, coef_r7, result1);
 
-      _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
 
       dst_ptr += dst_pitch;
       src_ptr += src_pitch;
@@ -1268,8 +1268,8 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_7, coef_r7, result0);
       result1 = _mm512_fmadd_ps(data_7_2, coef_r7, result1);
 
-      _mm512_stream_ps(dst_ptr, result0); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
-      _mm512_stream_ps(dst_ptr + dst_pitch, result1);
+      _mm512_store_ps(dst_ptr, result0); 
+      _mm512_store_ps(dst_ptr + dst_pitch, result1);
 
       dst_ptr += dst_pitch * 2;
       src_ptr += src_pitch * 2;
@@ -1300,7 +1300,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* s
       result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
       result1 = _mm512_fmadd_ps(data_7, coef_r7, result1);
 
-      _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
     }
 
     current_coeff += filter_size * 16;
@@ -1456,7 +1456,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE*
       result0 = _mm512_fmadd_ps(data_7, coef_r7, result0);
       result1 = _mm512_fmadd_ps(data_15, coef_r15, result1);
 
-      _mm512_stream_ps(dst_ptr, _mm512_add_ps(result0, result1)); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
 
       dst_ptr += dst_pitch;
       src_ptr += src_pitch;
@@ -1524,7 +1524,7 @@ void resize_v_avx512_planar_float(BYTE* dst8, const BYTE* src8, int dst_pitch, i
         result_single = _mm512_fmadd_ps(src_val, coeff, result_single);
       }
 
-      _mm512_stream_ps(dst + x, result_single);
+      _mm512_store_ps(dst + x, result_single);
     }
 
     dst += dst_pitch;
@@ -1593,14 +1593,14 @@ void resize_v_avx512_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pit
         src2_ptr += src_pitch;
       }
 
-      _mm512_stream_ps(dst + x, result_1); // it is best with RAW compute performance test but may be not best in filter chain and data splitting - better to use filter store control param cached or not cached stores
-      _mm512_stream_ps(dst + x + 16, result_2);
-      _mm512_stream_ps(dst + x + 32, result_3);
-      _mm512_stream_ps(dst + x + 48, result_4);
-      _mm512_stream_ps(dst + x + 64, result_5);
-      _mm512_stream_ps(dst + x + 80, result_6);
-      _mm512_stream_ps(dst + x + 96, result_7);
-      _mm512_stream_ps(dst + x + 112, result_8);
+      _mm512_store_ps(dst + x, result_1); 
+      _mm512_store_ps(dst + x + 16, result_2);
+      _mm512_store_ps(dst + x + 32, result_3);
+      _mm512_store_ps(dst + x + 48, result_4);
+      _mm512_store_ps(dst + x + 64, result_5);
+      _mm512_store_ps(dst + x + 80, result_6);
+      _mm512_store_ps(dst + x + 96, result_7);
+      _mm512_store_ps(dst + x + 112, result_8);
     }
 
     for (int x = width_mod128; x < width_mod64; x += 64) {
@@ -1628,10 +1628,10 @@ void resize_v_avx512_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pit
         src2_ptr += src_pitch;
       }
 
-      _mm512_stream_ps(dst + x, result_1);
-      _mm512_stream_ps(dst + x + 16, result_2);
-      _mm512_stream_ps(dst + x + 32, result_3);
-      _mm512_stream_ps(dst + x + 48, result_4);
+      _mm512_store_ps(dst + x, result_1);
+      _mm512_store_ps(dst + x + 16, result_2);
+      _mm512_store_ps(dst + x + 32, result_3);
+      _mm512_store_ps(dst + x + 48, result_4);
     }
 
     for (int x = width_mod64; x < width_mod32; x += 32) {
@@ -1653,8 +1653,8 @@ void resize_v_avx512_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pit
         src2_ptr += src_pitch;
       }
 
-      _mm512_stream_ps(dst + x, result_1);
-      _mm512_stream_ps(dst + x + 16, result_2);
+      _mm512_store_ps(dst + x, result_1);
+      _mm512_store_ps(dst + x + 16, result_2);
     }
 
 
@@ -1691,7 +1691,7 @@ void resize_v_avx512_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pit
         result_single = _mm512_fmadd_ps(src_val, coeff, result_single);
       }
 
-      _mm512_stream_ps(dst + x, result_single);
+      _mm512_store_ps(dst + x, result_single);
     }
 
 

From 58708e2f27a3634907663f3e91c8f8122343331a Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sun, 8 Jun 2025 10:33:31 -0700
Subject: [PATCH 23/27] Added AVX512 V-resampler for

8bit format.
---
 avs_core/filters/resample.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index 7e3d778a4..c604cd3ea 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -1809,6 +1809,10 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi
     if (pixelsize == 1)
     {
 #ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if ((CPU & CPUF_AVX512F) && (CPU & CPUF_AVX512BW)) // the uint8 kernel uses byte/word (AVX512BW) intrinsics, e.g. _mm512_madd_epi16; AVX512F alone is not enough
+        return resize_v_avx512_planar_uint8_t_w_sr;
+#endif
       if (CPU & CPUF_AVX2)
         return resize_v_avx2_planar_uint8_t;
       if (CPU & CPUF_SSE2)
@@ -2474,6 +2478,10 @@ ResamplerV FilteredResize_2p::GetResamplerV(int CPU, int pixelsize, int bits_per
     if (pixelsize == 1)
     {
 #ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if ((CPU & CPUF_AVX512F) && (CPU & CPUF_AVX512BW)) // the uint8 kernel uses byte/word (AVX512BW) intrinsics, e.g. _mm512_madd_epi16; AVX512F alone is not enough
+        return resize_v_avx512_planar_uint8_t_w_sr;
+#endif
       if (CPU & CPUF_AVX2)
         return resize_v_avx2_planar_uint8_t;
       if (CPU & CPUF_SSE2)

From 6df70051b2b63b55257c0aaf4578b773cd7ebc30 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Sun, 8 Jun 2025 10:34:25 -0700
Subject: [PATCH 24/27] Added AVX512 V-resampler for

8bit format
---
 avs_core/filters/intel/resample_avx512.cpp | 162 +++++++++++++++++++++
 avs_core/filters/intel/resample_avx512.h   |   3 +
 2 files changed, 165 insertions(+)

diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp
index 531d0a134..eefece605 100644
--- a/avs_core/filters/intel/resample_avx512.cpp
+++ b/avs_core/filters/intel/resample_avx512.cpp
@@ -1700,4 +1700,166 @@ void resize_v_avx512_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pit
   }
 }
 
+// uint8_t: vertical resampler for 8-bit planar data (AVX-512). Each output row y is the rounded fixed-point sum of kernel_size source rows, starting at row program->pixel_offset[y], weighted by the 16-bit coefficients of that row.
+void resize_v_avx512_planar_uint8_t_w_sr(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
+{
+  AVS_UNUSED(bits_per_pixel); // not needed by the 8-bit kernel
+  int filter_size = program->filter_size; // coefficient stride between consecutive output rows
+  const short* AVS_RESTRICT current_coeff = program->pixel_coefficient; // coefficients of the current output row
+  __m512i rounder = _mm512_set1_epi32(1 << (FPScale8bits - 1)); // half of the fixed-point unit; accumulators start from it so the final shift rounds
+  __m512i zero = _mm512_setzero_si512();
+
+  const int kernel_size = program->filter_size_real; // number of taps actually accumulated; not the aligned filter_size
+
+  const int width_mod128 = (width / 128) * 128; // width rounded down to a multiple of 128 pixels
+  // qword selectors for _mm512_permutex2var_epi64: values 0..7 pick from the first source, 8..15 from the second
+  const __m512i perm_idx1 = _mm512_set_epi64(8 + 5, 8 + 4, 8 + 1, 8 + 0, 5, 4, 1, 0); // qwords 0,1,4,5 of each source
+  const __m512i perm_idx2 = _mm512_set_epi64(8 + 7, 8 + 6, 8 + 3, 8 + 2, 7, 6, 3, 2); // qwords 2,3,6,7 of each source
+
+  for (int y = 0; y < target_height; y++) {
+    int offset = program->pixel_offset[y]; // first source row contributing to output row y
+    const BYTE* AVS_RESTRICT src_ptr = src + offset * src_pitch;
+
+    for (int x = 0; x < width_mod128; x += 128) {
+      // eight 32-bit accumulators of 16 lanes each (= 128 pixels), pre-loaded with the rounder
+      __m512i result_lo = rounder;
+      __m512i result_hi = rounder;
+      __m512i result_lo2 = rounder;
+      __m512i result_hi2 = rounder;
+
+      __m512i result_lo_2 = rounder;
+      __m512i result_hi_2 = rounder;
+      __m512i result_lo2_2 = rounder;
+      __m512i result_hi2_2 = rounder;
+
+      const uint8_t* AVS_RESTRICT src2_ptr = src_ptr + x;
+
+      int i = 0;
+      // one pass per filter tap; each pass reads 128 bytes = 128 pixels of one source row
+      for (; i < kernel_size; i++) {
+        // Broadcast a single coefficient to every 16-bit lane
+        __m512i coeff = _mm512_set1_epi16(*reinterpret_cast<const short*>(current_coeff + i)); // co|co|co|co ... (the zero partner of each pair comes from the source interleave below)
+
+        __m512i src_1_1 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr))); // 32x 8->16bit pixels
+        __m512i src_1_2 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr + 32))); // 32x 8->16bit pixels
+        __m512i src_2_1 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr + 64))); // 32x 8->16bit pixels
+        __m512i src_2_2 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr + 96))); // 32x 8->16bit pixels
+        // interleave every 16-bit pixel with a zero word, so madd yields pixel*coeff + 0*coeff in each 32-bit lane
+        __m512i src_lo = _mm512_unpacklo_epi16(src_1_1, zero);
+        __m512i src_hi = _mm512_unpackhi_epi16(src_1_1, zero);
+        __m512i src_lo2 = _mm512_unpacklo_epi16(src_1_2, zero);
+        __m512i src_hi2 = _mm512_unpackhi_epi16(src_1_2, zero);
+
+        __m512i src_lo_2 = _mm512_unpacklo_epi16(src_2_1, zero);
+        __m512i src_hi_2 = _mm512_unpackhi_epi16(src_2_1, zero);
+        __m512i src_lo2_2 = _mm512_unpacklo_epi16(src_2_2, zero);
+        __m512i src_hi2_2 = _mm512_unpackhi_epi16(src_2_2, zero);
+
+        result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c
+        result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c
+        result_lo2 = _mm512_add_epi32(result_lo2, _mm512_madd_epi16(src_lo2, coeff)); // a*b + c
+        result_hi2 = _mm512_add_epi32(result_hi2, _mm512_madd_epi16(src_hi2, coeff)); // a*b + c
+
+        result_lo_2 = _mm512_add_epi32(result_lo_2, _mm512_madd_epi16(src_lo_2, coeff)); // a*b + c
+        result_hi_2 = _mm512_add_epi32(result_hi_2, _mm512_madd_epi16(src_hi_2, coeff)); // a*b + c
+        result_lo2_2 = _mm512_add_epi32(result_lo2_2, _mm512_madd_epi16(src_lo2_2, coeff)); // a*b + c
+        result_hi2_2 = _mm512_add_epi32(result_hi2_2, _mm512_madd_epi16(src_hi2_2, coeff)); // a*b + c
+
+        src2_ptr += src_pitch; // next source row, same columns
+
+      }
+
+      // scale back, store
+      // arithmetic shift right by FPScale8bits removes the fixed-point scale (original note: 14 bits precision)
+      result_lo = _mm512_srai_epi32(result_lo, FPScale8bits);
+      result_hi = _mm512_srai_epi32(result_hi, FPScale8bits);
+      result_lo2 = _mm512_srai_epi32(result_lo2, FPScale8bits);
+      result_hi2 = _mm512_srai_epi32(result_hi2, FPScale8bits);
+
+      result_lo_2 = _mm512_srai_epi32(result_lo_2, FPScale8bits);
+      result_hi_2 = _mm512_srai_epi32(result_hi_2, FPScale8bits);
+      result_lo2_2 = _mm512_srai_epi32(result_lo2_2, FPScale8bits);
+      result_hi2_2 = _mm512_srai_epi32(result_hi2_2, FPScale8bits);
+      // pack 32-bit -> 16-bit with unsigned saturation (works within each 128-bit lane)
+      __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi);
+      __m512i result2_2x8x_uint16 = _mm512_packus_epi32(result_lo2, result_hi2);
+
+      __m512i result_2x8x_uint16_2 = _mm512_packus_epi32(result_lo_2, result_hi_2);
+      __m512i result2_2x8x_uint16_2 = _mm512_packus_epi32(result_lo2_2, result_hi2_2);
+      // regroup 64-bit chunks so that the lane-wise 16 -> 8 bit pack below emits the pixels in ascending order
+      __m512i pack_1 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx1, result2_2x8x_uint16);
+      __m512i pack_2 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx2, result2_2x8x_uint16);
+
+      __m512i pack_1_2 = _mm512_permutex2var_epi64(result_2x8x_uint16_2, perm_idx1, result2_2x8x_uint16_2);
+      __m512i pack_2_2 = _mm512_permutex2var_epi64(result_2x8x_uint16_2, perm_idx2, result2_2x8x_uint16_2);
+      // final pack 16-bit -> 8-bit with unsigned saturation
+      __m512i res = _mm512_packus_epi16(pack_1, pack_2);
+      __m512i res_2 = _mm512_packus_epi16(pack_1_2, pack_2_2);
+
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), res);
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x + 64), res);
+
+    }
+
+    // tail: remaining columns in steps of 64 bytes = 64 pixels
+    // NOTE(review): runs up to x < width in 64-pixel steps with aligned 64-byte stores, so it presumably relies on rows being padded and aligned to 64 bytes - confirm (original note: "no need wmod16, alignment is safe at least 32")
+    for (int x = width_mod128; x < width; x += 64) {
+      // four 32-bit accumulators of 16 lanes each (= 64 pixels), pre-loaded with the rounder
+      __m512i result_lo = rounder;
+      __m512i result_hi = rounder;
+
+      __m512i result_lo2 = rounder;
+      __m512i result_hi2 = rounder;
+
+      const uint8_t* AVS_RESTRICT src2_ptr = src_ptr + x;
+
+      int i = 0;
+      for (; i < kernel_size; i++) {
+        // Broadcast a single coefficient to every 16-bit lane
+        __m512i coeff = _mm512_set1_epi16(*reinterpret_cast<const short*>(current_coeff + i)); // co|co|co|co ... (the zero partner of each pair comes from the source interleave below)
+
+        __m512i src_1_1 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr))); // 32x 8->16bit pixels
+        __m512i src_1_2 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr + 32))); // 32x 8->16bit pixels
+        // interleave every 16-bit pixel with a zero word, so madd yields pixel*coeff + 0*coeff in each 32-bit lane
+        __m512i src_lo = _mm512_unpacklo_epi16(src_1_1, zero);
+        __m512i src_hi = _mm512_unpackhi_epi16(src_1_1, zero);
+
+        __m512i src_lo2 = _mm512_unpacklo_epi16(src_1_2, zero);
+        __m512i src_hi2 = _mm512_unpackhi_epi16(src_1_2, zero);
+
+        result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c
+        result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c
+
+        result_lo2 = _mm512_add_epi32(result_lo2, _mm512_madd_epi16(src_lo2, coeff)); // a*b + c
+        result_hi2 = _mm512_add_epi32(result_hi2, _mm512_madd_epi16(src_hi2, coeff)); // a*b + c
+
+        src2_ptr += src_pitch; // next source row, same columns
+
+      }
+
+      // scale back, store
+      // arithmetic shift right by FPScale8bits removes the fixed-point scale (original note: 14 bits precision)
+      result_lo = _mm512_srai_epi32(result_lo, FPScale8bits);
+      result_hi = _mm512_srai_epi32(result_hi, FPScale8bits);
+
+      result_lo2 = _mm512_srai_epi32(result_lo2, FPScale8bits);
+      result_hi2 = _mm512_srai_epi32(result_hi2, FPScale8bits);
+      // pack 32-bit -> 16-bit with unsigned saturation (works within each 128-bit lane)
+      __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi);
+      __m512i result_2x8x_uint16_2 = _mm512_packus_epi32(result_lo2, result_hi2);
+      // regroup 64-bit chunks so that the lane-wise 16 -> 8 bit pack below emits the pixels in ascending order
+      __m512i pack_1 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx1, result_2x8x_uint16_2);
+      __m512i pack_2 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx2, result_2x8x_uint16_2);
+
+      __m512i res = _mm512_packus_epi16(pack_1, pack_2);
+
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), res);
+
+    }
+
+    dst += dst_pitch; // next output row
+    current_coeff += filter_size; // coefficients of the next output row
+  }
+}
+
 
diff --git a/avs_core/filters/intel/resample_avx512.h b/avs_core/filters/intel/resample_avx512.h
index 2d7fab5b0..b9f8bb978 100644
--- a/avs_core/filters/intel/resample_avx512.h
+++ b/avs_core/filters/intel/resample_avx512.h
@@ -137,4 +137,7 @@ void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE*
 void resize_v_avx512_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 void resize_v_avx512_planar_float_w_sr(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 
+// uint8_t
+void resize_v_avx512_planar_uint8_t_w_sr(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
 #endif // __Resample_AVX512_H__

From 997c5a871c2374c38b8b51bcc4c0478391039375 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Tue, 10 Jun 2025 04:06:43 -0700
Subject: [PATCH 25/27] Fixed bug in AVX512 V 8bit

resampler.
---
 avs_core/filters/intel/resample_avx512.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp
index eefece605..d702ea386 100644
--- a/avs_core/filters/intel/resample_avx512.cpp
+++ b/avs_core/filters/intel/resample_avx512.cpp
@@ -1797,7 +1797,7 @@ void resize_v_avx512_planar_uint8_t_w_sr(BYTE* AVS_RESTRICT dst, const BYTE* src
       __m512i res_2 = _mm512_packus_epi16(pack_1_2, pack_2_2);
 
       _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), res);
-      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x + 64), res);
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x + 64), res_2);
 
     }
 

From cd01498ffe5486c6fcb955290ae6468da0f38a97 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Tue, 10 Jun 2025 10:36:30 -0700
Subject: [PATCH 26/27] Added AVX512 V 16bit

resampler
---
 avs_core/filters/resample.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index c604cd3ea..18ca2ba5c 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -1828,6 +1828,13 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi
     else if (pixelsize == 2)
     {
 #ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if ((CPU & CPUF_AVX512F) && (CPU & CPUF_AVX512BW)) { // the uint16 kernel uses byte/word (AVX512BW) intrinsics, e.g. _mm512_madd_epi16; AVX512F alone is not enough
+        if (bits_per_pixel < 16)
+          return resize_v_avx512_planar_uint16_t_w_sr<true>;
+        return resize_v_avx512_planar_uint16_t_w_sr<false>;
+      }
+#endif
       if (CPU & CPUF_AVX2) {
         if (bits_per_pixel < 16)
           return resize_v_avx2_planar_uint16_t<true>;
@@ -2497,6 +2504,13 @@ ResamplerV FilteredResize_2p::GetResamplerV(int CPU, int pixelsize, int bits_per
     else if (pixelsize == 2)
     {
 #ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if ((CPU & CPUF_AVX512F) && (CPU & CPUF_AVX512BW)) { // the uint16 kernel uses byte/word (AVX512BW) intrinsics, e.g. _mm512_madd_epi16; AVX512F alone is not enough
+        if (bits_per_pixel < 16)
+          return resize_v_avx512_planar_uint16_t_w_sr<true>;
+        return resize_v_avx512_planar_uint16_t_w_sr<false>;
+      }
+#endif
       if (CPU & CPUF_AVX2) {
         if (bits_per_pixel < 16)
           return resize_v_avx2_planar_uint16_t<true>;

From 034a47e2c91ad9d84ad24492d37a18f99b58d996 Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Tue, 10 Jun 2025 10:37:40 -0700
Subject: [PATCH 27/27] Added AVX512 V 16bit

resampler
---
 avs_core/filters/intel/resample_avx512.cpp | 149 +++++++++++++++++++++
 avs_core/filters/intel/resample_avx512.h   |   4 +
 2 files changed, 153 insertions(+)

diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp
index d702ea386..68e9cae78 100644
--- a/avs_core/filters/intel/resample_avx512.cpp
+++ b/avs_core/filters/intel/resample_avx512.cpp
@@ -1862,4 +1862,153 @@ void resize_v_avx512_planar_uint8_t_w_sr(BYTE* AVS_RESTRICT dst, const BYTE* src
   }
 }
 
+// uint16_t: vertical resampler for 16-bit-stored planar data (AVX-512). lessthan16bit = true clamps the result to (1 << bits_per_pixel) - 1; false selects the exact-16-bit path that offsets pixels into the signed range before the multiply.
+template<bool lessthan16bit>
+void resize_v_avx512_planar_uint16_t_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
+{
+  int filter_size = program->filter_size; // coefficient stride between consecutive output rows
+  const short* AVS_RESTRICT current_coeff = program->pixel_coefficient; // coefficients of the current output row
+
+  const __m512i zero = _mm512_setzero_si512();
+
+  const int width_mod64 = (width / 64) * 64; // width rounded down to a multiple of 64 pixels
+
+  // for 16 bits only: pixels get -32768 added (wrapping) before the signed 16-bit madd, and 32768 scaled by FPScale16bits is added back after accumulation
+  const __m512i shifttosigned = _mm512_set1_epi16(-32768);
+  const __m512i shiftfromsigned = _mm512_set1_epi32(32768 << FPScale16bits); // NOTE(review): presumably assumes the coefficients of a row sum to 1 << FPScale16bits - confirm
+
+  const __m512i rounder = _mm512_set1_epi32(1 << (FPScale16bits - 1)); // half of the fixed-point unit; accumulators start from it so the final shift rounds
+
+  const uint16_t* src = (uint16_t*)src8;
+  uint16_t* AVS_RESTRICT dst = (uint16_t * AVS_RESTRICT)dst8;
+  dst_pitch = dst_pitch / sizeof(uint16_t); // pitches are converted from bytes to uint16_t elements
+  src_pitch = src_pitch / sizeof(uint16_t);
+
+  const int kernel_size = program->filter_size_real; // number of taps actually accumulated; not the aligned filter_size
+
+  const int limit = (1 << bits_per_pixel) - 1; // largest pixel value for the given bit depth
+  __m512i clamp_limit = _mm512_set1_epi16((short)limit); // clamp limit for <16 bits
+
+  for (int y = 0; y < target_height; y++) {
+    int offset = program->pixel_offset[y]; // first source row contributing to output row y
+    const uint16_t* src_ptr = src + offset * src_pitch;
+
+    // 128 byte 64 word
+    for (int x = 0; x < width_mod64; x += 64) {
+      // four 32-bit accumulators of 16 lanes each (= 64 pixels), pre-loaded with the rounder
+      __m512i result_lo = rounder;
+      __m512i result_hi = rounder;
+
+      __m512i result_lo_2 = rounder;
+      __m512i result_hi_2 = rounder;
+
+      const uint16_t* AVS_RESTRICT src2_ptr = src_ptr + x;
+
+      int i = 0;
+      for (; i < kernel_size; i++) {
+        // Broadcast a single coefficient to every 16-bit lane
+        __m512i coeff = _mm512_set1_epi16(current_coeff[i]); // co|co|co|co ... (the zero partner of each pair comes from the source interleave below)
+
+        __m512i src = _mm512_load_si512(reinterpret_cast<const __m512i*>(src2_ptr)); // 32x 16bit pixels; NOTE: this local shadows the outer `src` pointer
+        __m512i src_2 = _mm512_load_si512(reinterpret_cast<const __m512i*>(src2_ptr + 32)); // 32x 16bit pixels
+
+        if (!lessthan16bit) {
+          src = _mm512_add_epi16(src, shifttosigned);
+          src_2 = _mm512_add_epi16(src_2, shifttosigned);
+        }
+        // interleave every 16-bit pixel with a zero word, so madd yields pixel*coeff + 0*coeff in each 32-bit lane
+        __m512i src_lo = _mm512_unpacklo_epi16(src, zero);
+        __m512i src_hi = _mm512_unpackhi_epi16(src, zero);
+
+        __m512i src_lo_2 = _mm512_unpacklo_epi16(src_2, zero);
+        __m512i src_hi_2 = _mm512_unpackhi_epi16(src_2, zero);
+
+        result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c
+        result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c
+
+        result_lo_2 = _mm512_add_epi32(result_lo_2, _mm512_madd_epi16(src_lo_2, coeff)); // a*b + c
+        result_hi_2 = _mm512_add_epi32(result_hi_2, _mm512_madd_epi16(src_hi_2, coeff)); // a*b + c
+
+        src2_ptr += src_pitch; // next source row, same columns
+      }
+
+      if (!lessthan16bit) {
+        result_lo = _mm512_add_epi32(result_lo, shiftfromsigned);
+        result_hi = _mm512_add_epi32(result_hi, shiftfromsigned);
+
+        result_lo_2 = _mm512_add_epi32(result_lo_2, shiftfromsigned);
+        result_hi_2 = _mm512_add_epi32(result_hi_2, shiftfromsigned);
+
+      }
+      // arithmetic shift right by FPScale16bits removes the fixed-point scale (original note: 13 bits precision)
+      result_lo = _mm512_srai_epi32(result_lo, FPScale16bits);
+      result_hi = _mm512_srai_epi32(result_hi, FPScale16bits);
+
+      result_lo_2 = _mm512_srai_epi32(result_lo_2, FPScale16bits);
+      result_hi_2 = _mm512_srai_epi32(result_hi_2, FPScale16bits);
+      // pack 32-bit -> 16-bit with unsigned saturation; lane-wise, which undoes the lane-wise unpacklo/unpackhi split above
+      __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi);
+      __m512i result_2x8x_uint16_2 = _mm512_packus_epi32(result_lo_2, result_hi_2);
+      if (lessthan16bit) {
+        result_2x8x_uint16 = _mm512_min_epu16(result_2x8x_uint16, clamp_limit); // extra clamp for 10-14 bit
+        result_2x8x_uint16_2 = _mm512_min_epu16(result_2x8x_uint16_2, clamp_limit); // extra clamp for 10-14 bit
+      }
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), result_2x8x_uint16);
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x + 32), result_2x8x_uint16_2);
+    }
+    // tail: remaining columns, 32 pixels per step
+    // last 32
+    // 64 byte 32 word
+    for (int x = width_mod64; x < width; x += 32) { // NOTE(review): uses aligned 64-byte loads/stores up to x < width, so it presumably relies on rows being padded and aligned to 64 bytes - confirm
+
+      __m512i result_lo = rounder;
+      __m512i result_hi = rounder;
+
+      const uint16_t* AVS_RESTRICT src2_ptr = src_ptr + x;
+
+      int i = 0;
+      for (; i < kernel_size; i++) {
+        // Broadcast a single coefficient to every 16-bit lane
+        __m512i coeff = _mm512_set1_epi16(current_coeff[i]); // co|co|co|co ... (the zero partner of each pair comes from the source interleave below)
+
+        __m512i src = _mm512_load_si512(reinterpret_cast<const __m512i*>(src2_ptr)); // 32x 16bit pixels; NOTE: this local shadows the outer `src` pointer
+        if (!lessthan16bit) {
+          src = _mm512_add_epi16(src, shifttosigned);
+        }
+        __m512i src_lo = _mm512_unpacklo_epi16(src, zero);
+        __m512i src_hi = _mm512_unpackhi_epi16(src, zero);
+        result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c
+        result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c
+
+        src2_ptr += src_pitch; // next source row, same columns
+      }
+
+      if (!lessthan16bit) {
+        result_lo = _mm512_add_epi32(result_lo, shiftfromsigned);
+        result_hi = _mm512_add_epi32(result_hi, shiftfromsigned);
+      }
+      // arithmetic shift right by FPScale16bits removes the fixed-point scale (original note: 13 bits precision)
+      result_lo = _mm512_srai_epi32(result_lo, FPScale16bits);
+      result_hi = _mm512_srai_epi32(result_hi, FPScale16bits);
+
+      __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi);
+      if (lessthan16bit) {
+        result_2x8x_uint16 = _mm512_min_epu16(result_2x8x_uint16, clamp_limit); // extra clamp for 10-14 bit
+      }
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), result_2x8x_uint16);
+
+    }
+
+    dst += dst_pitch; // next output row
+    current_coeff += filter_size; // coefficients of the next output row
+  }
+}
+
+// explicit instantiation, avx512 exact 16 bit (lessthan16bit = false: signed-offset path, no extra clamp)
+template void resize_v_avx512_planar_uint16_t_w_sr<false>(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+// explicit instantiation, avx512 10-14bit (lessthan16bit = true: result clamped to (1 << bits_per_pixel) - 1)
+template void resize_v_avx512_planar_uint16_t_w_sr<true>(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
+
+
 
diff --git a/avs_core/filters/intel/resample_avx512.h b/avs_core/filters/intel/resample_avx512.h
index b9f8bb978..a9b5a0706 100644
--- a/avs_core/filters/intel/resample_avx512.h
+++ b/avs_core/filters/intel/resample_avx512.h
@@ -140,4 +140,8 @@ void resize_v_avx512_planar_float_w_sr(BYTE* dst0, const BYTE* src0, int dst_pit
 // uint8_t
 void resize_v_avx512_planar_uint8_t_w_sr(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 
+// uint16_t vertical resampler; lessthan16bit = true is the variant for bit depths below 16 (its definition adds a clamp to the bit-depth maximum), false is exact 16 bit
+template<bool lessthan16bit>
+void resize_v_avx512_planar_uint16_t_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
 #endif // __Resample_AVX512_H__