diff --git a/avs_core/filters/intel/resample_avx2.cpp b/avs_core/filters/intel/resample_avx2.cpp
index 1929021eb..b28c74b24 100644
--- a/avs_core/filters/intel/resample_avx2.cpp
+++ b/avs_core/filters/intel/resample_avx2.cpp
@@ -365,7 +365,7 @@ static void internal_resizer_h_avx2_generic_uint8_16_t(BYTE* dst8, const BYTE* s
   dst_pitch /= sizeof(pixel_t);
   src_pitch /= sizeof(pixel_t);
 
-  const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8;
+  const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8;
 
   for (int y = 0; y < height; y++) {
     const short* AVS_RESTRICT current_coeff_base = program->pixel_coefficient;
@@ -586,7 +586,7 @@ static void internal_resizer_h_avx2_generic_float(BYTE* dst8, const BYTE* src8,
   dst_pitch = dst_pitch / sizeof(float);
   src_pitch = src_pitch / sizeof(float);
 
-  const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8;
+  const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8;
 
   for (int y = 0; y < height; y++) {
     float* current_coeff_base = program->pixel_coefficient_float;
@@ -627,7 +627,7 @@ void resizer_h_avx2_generic_float(BYTE* dst8, const BYTE* src8, int dst_pitch, i
 // end of H float
 
 //-------- 256 bit Verticals
-/*
+#if 0
 void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
 {
   AVS_UNUSED(bits_per_pixel);
@@ -702,7 +702,7 @@ void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int d
     current_coeff += filter_size;
   }
 }
-*/
+#else
 
 void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
 {
@@ -822,7 +822,7 @@ void resize_v_avx2_planar_uint8_t(BYTE* AVS_RESTRICT dst, const BYTE* src, int d
         current_coeff += filter_size;
     }
 }
-
+#endif
 template<bool lessthan16bit>
 void resize_v_avx2_planar_uint16_t(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
 {
@@ -987,6 +987,102 @@ void resize_v_avx2_planar_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int
   }
 }
 
+// Memory-transfer optimized version 
+void resize_v_avx2_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
+{
+  AVS_UNUSED(bits_per_pixel);
+
+  const int filter_size = program->filter_size;
+  const float* AVS_RESTRICT current_coeff = program->pixel_coefficient_float;
+
+  const float* src = (const float*)src8;
+  float* AVS_RESTRICT dst = (float*)dst8;
+  dst_pitch = dst_pitch / sizeof(float);
+  src_pitch = src_pitch / sizeof(float);
+
+  const int kernel_size = program->filter_size_real; // not the aligned
+  const int kernel_size_mod2 = (kernel_size / 2) * 2; // Process pairs of rows for better efficiency
+  const bool notMod2 = kernel_size_mod2 < kernel_size;
+
+  const int width_mod32 = (width / 32) * 32; // Process by 4x 256bit (8 x 8 floats) to make memory read/write linear streams longer, 16x256 bit registers in 64bit mode should be enough
+
+  for (int y = 0; y < target_height; y++) {
+    int offset = program->pixel_offset[y];
+    const float* src_ptr = src + offset * src_pitch;
+
+    for (int x = 0; x < width_mod32; x += 32) {
+      __m256 result_1 = _mm256_setzero_ps();
+      __m256 result_2 = _mm256_setzero_ps();
+      __m256 result_3 = _mm256_setzero_ps();
+      __m256 result_4 = _mm256_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      int i = 0;
+      for (; i < kernel_size; i ++) {
+        // coefs are equal for all H-samples
+        __m256 coeff = _mm256_set1_ps(current_coeff[i]);
+
+        __m256 src_1 = _mm256_load_ps(src2_ptr); // why was loadu ? source always aligned in V-resizers ?
+        __m256 src_2 = _mm256_load_ps(src2_ptr + 8);
+        __m256 src_3 = _mm256_load_ps(src2_ptr + 16);
+        __m256 src_4 = _mm256_load_ps(src2_ptr + 24);
+
+        result_1 = _mm256_fmadd_ps(src_1, coeff, result_1);
+        result_2 = _mm256_fmadd_ps(src_2, coeff, result_2);
+        result_3 = _mm256_fmadd_ps(src_3, coeff, result_3);
+        result_4 = _mm256_fmadd_ps(src_4, coeff, result_4);
+
+        src2_ptr += src_pitch;
+      }
+
+      _mm256_store_ps(dst + x, result_1); 
+      _mm256_store_ps(dst + x + 8, result_2);
+      _mm256_store_ps(dst + x + 16, result_3);
+      _mm256_store_ps(dst + x + 24, result_4);
+    } // width_mod32
+
+    // 32 byte 8 floats (AVX2 register holds 8 floats)
+    // no need for wmod8, alignment is safe 32 bytes at least
+    for (int x = width_mod32; x < width; x += 8) {
+      __m256 result_single = _mm256_setzero_ps();
+      __m256 result_single_2 = _mm256_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      // Process pairs of rows for better efficiency (2 coeffs/cycle)
+      // two result variables for potential parallel operation
+      int i = 0;
+      for (; i < kernel_size_mod2; i += 2) {
+        __m256 coeff_even = _mm256_set1_ps(current_coeff[i]);
+        __m256 coeff_odd = _mm256_set1_ps(current_coeff[i + 1]);
+
+        __m256 src_even = _mm256_load_ps(src2_ptr);
+        __m256 src_odd = _mm256_load_ps(src2_ptr + src_pitch);
+
+        result_single = _mm256_fmadd_ps(src_even, coeff_even, result_single);
+        result_single_2 = _mm256_fmadd_ps(src_odd, coeff_odd, result_single_2);
+
+        src2_ptr += 2 * src_pitch;
+      }
+
+      result_single = _mm256_add_ps(result_single, result_single_2);
+
+      // Process the last odd row if needed
+      if (notMod2) {
+        __m256 coeff = _mm256_set1_ps(current_coeff[i]);
+        __m256 src_val = _mm256_load_ps(src2_ptr);
+        result_single = _mm256_fmadd_ps(src_val, coeff, result_single);
+      }
+
+      _mm256_store_ps(dst + x, result_single);
+    }
+
+    dst += dst_pitch;
+    current_coeff += filter_size;
+  }
+}
+
 // avx2 16bit
 template void resizer_h_avx2_generic_uint16_t<false>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 // avx2 10-14bit
@@ -998,3 +1094,874 @@ template void resizer_h_avx2_generic_uint16_t<true>(BYTE* dst8, const BYTE* src8
 template void resize_v_avx2_planar_uint16_t<false>(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 // avx2 10-14bit
 template void resize_v_avx2_planar_uint16_t<true>(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
+
+// Transpose-based SIMD
+void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+    int filter_size = program->filter_size;
+
+    const float* AVS_RESTRICT current_coeff;
+
+    src_pitch = src_pitch / sizeof(float);
+    dst_pitch = dst_pitch / sizeof(float);
+
+    float* src = (float*)src8;
+    float* dst = (float*)dst8;
+
+    const int kernel_size = program->filter_size_real;
+    const int ksmod4 = kernel_size / 4 * 4;
+//    const int ksmod8 = kernel_size / 8 * 8;
+#if 0
+
+    for (int y = 0; y < height; y ++) {
+        current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+        float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
+        const float* src_ptr = src + y * src_pitch;
+
+        for (int x = 0; x < width; x += 8) {
+
+            __m256 result = _mm256_setzero_ps();
+
+            for (int i = 0; i < ksmod4; i += 4) {
+
+                __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0] + i, src_ptr + program->pixel_offset[x + 4] + i);
+                __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1] + i, src_ptr + program->pixel_offset[x + 5] + i);
+                __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2] + i, src_ptr + program->pixel_offset[x + 6] + i);
+                __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3] + i, src_ptr + program->pixel_offset[x + 7] + i);
+
+                __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + i + filter_size * 0, current_coeff + i + filter_size * 4);
+                __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + i + filter_size * 1, current_coeff + i + filter_size * 5);
+                __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + i + filter_size * 2, current_coeff + i + filter_size * 6);
+                __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + i + filter_size * 3, current_coeff + i + filter_size * 7);
+
+                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
+                _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+
+                result = _mm256_fmadd_ps(data_1_data_5, coef_1_coef_5, result);
+                result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+                result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+                result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+
+            }
+
+            _mm256_store_ps(dst2_ptr + x, result);
+            current_coeff += filter_size * 8;
+        }
+    }
+#endif
+
+    for (int y = 0; y < height; y+=2) {
+        current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+        float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
+        float* AVS_RESTRICT dst2_ptr2 = dst + (y + 1) * dst_pitch;
+        const float* src_ptr = src + y * src_pitch;
+        const float* src_ptr2 = src + (y + 1) * src_pitch;
+
+        for (int x = 0; x < width; x += 8) {
+
+            __m256 result = _mm256_setzero_ps();
+            __m256 result2 = _mm256_setzero_ps();
+
+            for (int i = 0; i < kernel_size; i += 4) { // is it always mod4 ?
+                __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0] + i, src_ptr + program->pixel_offset[x + 4] + i);
+                __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1] + i, src_ptr + program->pixel_offset[x + 5] + i);
+                __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2] + i, src_ptr + program->pixel_offset[x + 6] + i);
+                __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3] + i, src_ptr + program->pixel_offset[x + 7] + i);
+
+                __m256 data_1_data_5_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 0] + i, src_ptr2 + program->pixel_offset[x + 4] + i);
+                __m256 data_2_data_6_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 1] + i, src_ptr2 + program->pixel_offset[x + 5] + i);
+                __m256 data_3_data_7_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 2] + i, src_ptr2 + program->pixel_offset[x + 6] + i);
+                __m256 data_4_data_8_2 = _mm256_loadu_2_m128(src_ptr2 + program->pixel_offset[x + 3] + i, src_ptr2 + program->pixel_offset[x + 7] + i);
+
+                __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + i + filter_size * 0, current_coeff + i + filter_size * 4);
+                __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + i + filter_size * 1, current_coeff + i + filter_size * 5);
+                __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + i + filter_size * 2, current_coeff + i + filter_size * 6);
+                __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + i + filter_size * 3, current_coeff + i + filter_size * 7);
+
+                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
+                _MM_TRANSPOSE8_LANE4_PS(data_1_data_5_2, data_2_data_6_2, data_3_data_7_2, data_4_data_8_2);
+                _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+
+                result = _mm256_fmadd_ps(data_1_data_5, coef_1_coef_5, result);
+                result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+                result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+                result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+
+                result2 = _mm256_fmadd_ps(data_1_data_5_2, coef_1_coef_5, result2);
+                result2 = _mm256_fmadd_ps(data_2_data_6_2, coef_2_coef_6, result2);
+                result2 = _mm256_fmadd_ps(data_3_data_7_2, coef_3_coef_7, result2);
+                result2 = _mm256_fmadd_ps(data_4_data_8_2, coef_4_coef_8, result2);
+
+            }
+
+            // need to process last non-mod4 kernel samples in scalar way. or can we do over-read up to 3 kernel and source samples safely with main 4-kernel_samples loop ?
+
+            _mm256_store_ps(dst2_ptr + x, result);
+            _mm256_store_ps(dst2_ptr2 + x, result2);
+            current_coeff += filter_size * 8;
+        }
+    }
+
+}
+
+
+// Safe dual lane partial load with AVX
+// Read exactly N pixels, avoiding
+// - reading beyond the end of the source buffer.
+// - avoid NaN contamination, since event with zero coefficients NaN * 0 = NaN
+template <int Nmod4>
+AVS_FORCEINLINE static __m256 _mm256_load_partial_safe_2_m128(const float* src_ptr_offsetted1, const float* src_ptr_offsetted2) {
+  __m128 s1;
+  __m128 s2;
+  switch (Nmod4) {
+  case 1:
+    s1 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted2[0]);
+    // ideally: movss
+    break;
+  case 2:
+    s1 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    // ideally: movsd
+    break;
+  case 3:
+    s1 = _mm_set_ps(0.0f, src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    // ideally: movss + movsd + shuffle or movsd + insert
+    break;
+  case 0:
+    s1 = _mm_set_ps(src_ptr_offsetted1[3], src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(src_ptr_offsetted2[3], src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    // ideally: movups
+    break;
+  default:
+    s1 = _mm_setzero_ps(); // n/a cannot happen
+    s2 = _mm_setzero_ps();
+  }
+  return _mm256_set_m128(s2, s1);
+}
+
+
+// Processes a horizontal resampling kernel of up to four coefficients for float pixel types.
+// Supports BilinearResize, BicubicResize, or sinc with up to 2 taps (filter size <= 4).
+// AVX optimization loads and processes four float coefficients and eight pixels simultaneously.
+// The 'filtersizemod4' template parameter (0-3) helps optimize for different filter sizes modulo 4.
+// This AVX2 requires only filter_size_alignment of 4.
+template<int filtersizemod4>
+void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+  assert(filtersizemod4 >= 0 && filtersizemod4 <= 3);
+
+  const int filter_size = program->filter_size; // aligned, practically the coeff table stride
+
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  constexpr int PIXELS_AT_A_TIME = 8; // Process eight pixels in parallel using AVX2 (2x4 using m128 lanes)
+
+  // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width.
+  // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back
+  // from the target width, as we load 4 floats at once with '_mm_loadu_ps'.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
+
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop.
+
+  // 'target_size_alignment' ensures we can safely access coefficients using offsets like
+  // 'filter_size * 7' when processing 8 H pixels at a time or
+  // 'filter_size * 15' when processing 16 H pixels at a time
+  assert(program->target_size_alignment >= 8);
+
+  // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads.
+  assert(program->filter_size_alignment >= 4);
+
+  int x = 0;
+
+  // This 'auto' lambda construct replaces the need of templates
+  auto do_h_float_core = [&](auto partial_load) {
+    // Load up to 2x4 coefficients at once before the height loop.
+    // Pre-loading and transposing coefficients keeps register usage efficient.
+    // Assumes 'filter_size_aligned' is at least 4.
+
+    // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3] and for src_ptr + begin5 [0..3] )
+    __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+    __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+    __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+    __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+
+    _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src;
+
+    // Pixel offsets for the current target x-positions.
+    // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+    const int begin1 = program->pixel_offset[x + 0];
+    const int begin2 = program->pixel_offset[x + 1];
+    const int begin3 = program->pixel_offset[x + 2];
+    const int begin4 = program->pixel_offset[x + 3];
+    const int begin5 = program->pixel_offset[x + 4];
+    const int begin6 = program->pixel_offset[x + 5];
+    const int begin7 = program->pixel_offset[x + 6];
+    const int begin8 = program->pixel_offset[x + 7];
+
+    for (int y = 0; y < height; y++)
+    {
+      __m256 data_1_data_5;
+      __m256 data_2_data_6;
+      __m256 data_3_data_7;
+      __m256 data_4_data_8;
+
+      if constexpr (partial_load) {
+        // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+        // to prevent reading beyond the allocated source scanline. This handles cases where loading 4 floats
+        // starting from 'src_ptr + beginX' might exceed the source buffer.
+
+        // Example of the unsafe scenario: If target width is 320, a naive load at src_ptr + 317
+        // would attempt to read floats at indices 317, 318, 319, and 320, potentially going out of bounds.
+
+        // Two main issues in the unsafe zone:
+        // 1.) Out-of-bounds memory access: Reading beyond the allocated memory for the source scanline can
+        //     lead to access violations and crashes. '_mm_loadu_ps' attempts to load 16 bytes, so even if
+        //     the starting address is within bounds, subsequent reads might not be.
+        // 2.) Garbage or NaN values: Even if a read doesn't cause a crash, accessing uninitialized or
+        //     out-of-bounds memory (especially for float types) can result in garbage data, including NaN.
+        //     Multiplying by a valid coefficient and accumulating this NaN can contaminate the final result.
+
+        // '_mm256_load_partial_safe_2_m128' safely loads up to 'filter_size_real' pixels and pads with zeros if needed,
+        // preventing out-of-bounds reads and ensuring predictable results even near the image edges.
+
+        data_1_data_5 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5);
+        data_2_data_6 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6);
+        data_3_data_7 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7);
+        data_4_data_8 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8);
+      }
+      else {
+        // In the safe zone, we can directly load 4 pixels at a time using unaligned loads.
+        data_1_data_5 = _mm256_loadu_2_m128(src_ptr + begin1, src_ptr + begin5);
+        data_2_data_6 = _mm256_loadu_2_m128(src_ptr + begin2, src_ptr + begin6);
+        data_3_data_7 = _mm256_loadu_2_m128(src_ptr + begin3, src_ptr + begin7);
+        data_4_data_8 = _mm256_loadu_2_m128(src_ptr + begin4, src_ptr + begin8);
+      }
+
+      _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
+
+      __m256 result = _mm256_mul_ps(data_1_data_5, coef_1_coef_5);
+      result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+      result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+      result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+
+      _mm256_store_ps(dst_ptr, result);
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    } // y
+    current_coeff += filter_size * 8; // Move to the next set of coefficients for the next 8 output pixels
+    }; // end of lambda
+
+  // Process the 'safe zone' where direct full unaligned loads are acceptable.
+  for (; x < width_safe_mod; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm_loadu_ps
+  }
+
+  // Process the potentially 'unsafe zone' near the image edge, using safe loading.
+  for (; x < width; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm256_load_partial_safe_2_m128'
+  }
+}
+
+// Instantiate them
+template void resize_h_planar_float_avx_transpose_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx_transpose_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx_transpose_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx_transpose_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+void resize_h_planar_float_avx_gather_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  int filter_size = program->filter_size;
+
+  const float* AVS_RESTRICT current_coeff;
+
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+  __m256i one_epi32 = _mm256_set1_epi32(1);
+
+  for (int x = 0; x < width; x += 8)
+  {
+    __m256 coef_0 = _mm256_load_ps(current_coeff + filter_size * 0);
+    __m256 coef_1 = _mm256_load_ps(current_coeff + filter_size * 1);
+    __m256 coef_2 = _mm256_load_ps(current_coeff + filter_size * 2);
+    __m256 coef_3 = _mm256_load_ps(current_coeff + filter_size * 3);
+    __m256 coef_4 = _mm256_load_ps(current_coeff + filter_size * 4);
+    __m256 coef_5 = _mm256_load_ps(current_coeff + filter_size * 5);
+    __m256 coef_6 = _mm256_load_ps(current_coeff + filter_size * 6);
+    __m256 coef_7 = _mm256_load_ps(current_coeff + filter_size * 7);
+
+    _MM_TRANSPOSE8_PS(coef_0, coef_1, coef_2, coef_3, coef_4, coef_5, coef_6, coef_7);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src;
+
+    for (int y = 0; y < height; y++)
+    {
+      //            __m256i offsets = _mm256_load_si256(program->pixel_offset + x); // hope it is always aligned ?
+      __m256i offsets = _mm256_set_epi32(program->pixel_offset[x + 7], program->pixel_offset[x + 6], program->pixel_offset[x + 5], program->pixel_offset[x + 4], program->pixel_offset[x + 3], program->pixel_offset[x + 2], program->pixel_offset[x + 1], program->pixel_offset[x + 0]);
+      //            __m256i offsets = _mm256_set1_epi32(program->pixel_offset[x]); // test
+      __m256 data_0 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_1 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_2 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_3 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_4 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_5 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_6 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+      offsets = _mm256_add_epi32(offsets, one_epi32);
+      __m256 data_7 = _mm256_i32gather_ps(src_ptr, offsets, 4);
+
+      __m256 result0 = _mm256_mul_ps(data_0, coef_0);
+      __m256 result1 = _mm256_mul_ps(data_4, coef_4);
+
+      result0 = _mm256_fmadd_ps(data_1, coef_1, result0);
+      result1 = _mm256_fmadd_ps(data_5, coef_5, result1);
+
+      result0 = _mm256_fmadd_ps(data_2, coef_2, result0);
+      result1 = _mm256_fmadd_ps(data_6, coef_6, result1);
+
+      result0 = _mm256_fmadd_ps(data_3, coef_3, result0);
+      result1 = _mm256_fmadd_ps(data_7, coef_7, result1);
+
+      _mm256_store_ps(dst_ptr, _mm256_add_ps(result0, result1));
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+    current_coeff += filter_size * 8;
+  }
+}
+
+void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+
+  // assert - check if max pixel_offset is not above single load of 8 src floats (or need several loads and more complex permute program)
+  // probably this is a valid assumption; there can be no jumps in source pixel indexes, it would mean that the
+  // filter would neglect some pixels in the source image, which is not allowed by the filter design
+#if 1 //def _DEBUG
+  for (int x = 0; x < width; x += 8)
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 7];
+    assert((end_off - start_off) <= 7);
+
+    start_off = program->pixel_offset[x + 1];
+    end_off = program->pixel_offset[x + 7 + 1];
+    assert((end_off - start_off) <= 7);
+
+    start_off = program->pixel_offset[x + 2];
+    end_off = program->pixel_offset[x + 7 + 2];
+    assert((end_off - start_off) <= 7);
+
+    start_off = program->pixel_offset[x + 3];
+    end_off = program->pixel_offset[x + 7 + 3];
+    assert((end_off - start_off) <= 7);
+  }
+#endif
+
+  int filter_size = program->filter_size;
+
+  const float* AVS_RESTRICT current_coeff;
+  __m256i one_epi32 = _mm256_set1_epi32(1);
+
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 8)
+  {
+    // prepare coefs in transposed V-form
+    __m256 coef_0 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+    __m256 coef_1 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+    __m256 coef_2 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+    __m256 coef_3 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+
+    _MM_TRANSPOSE8_LANE4_PS(coef_0, coef_1, coef_2, coef_3);
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+
+    __m256i perm_0 = _mm256_set_epi32(
+      program->pixel_offset[x + 7] - iStart,
+      program->pixel_offset[x + 6] - iStart,
+      program->pixel_offset[x + 5] - iStart,
+      program->pixel_offset[x + 4] - iStart,
+      program->pixel_offset[x + 3] - iStart,
+      program->pixel_offset[x + 2] - iStart,
+      program->pixel_offset[x + 1] - iStart,
+      0);
+    __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
+    one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+    __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
+    one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+    __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
+    /*
+    one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 1] - program->pixel_offset[x + 0]);
+    __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
+    one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+    __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
+    one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+    __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
+    __m256i perm_1 = _mm256_set_epi32(
+      program->pixel_offset[x + 7] - iStart,
+      program->pixel_offset[x + 6] - iStart,
+      program->pixel_offset[x + 5] - iStart,
+      program->pixel_offset[x + 4] - iStart,
+      program->pixel_offset[x + 3] - iStart,
+      program->pixel_offset[x + 2] - iStart,
+      program->pixel_offset[x + 1] - iStart,
+      0);
+    */
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+    for (int y = 0; y < height; y++)
+    {
+      __m256 data_src = _mm256_loadu_ps(src_ptr);
+
+      __m256 data_0 = _mm256_permutevar8x32_ps(data_src, perm_0);
+      __m256 data_1 = _mm256_permutevar8x32_ps(data_src, perm_1);
+      __m256 data_2 = _mm256_permutevar8x32_ps(data_src, perm_2);
+      __m256 data_3 = _mm256_permutevar8x32_ps(data_src, perm_3);
+
+      __m256 result0 = _mm256_mul_ps(data_0, coef_0);
+      __m256 result1 = _mm256_mul_ps(data_2, coef_2);
+
+      result0 = _mm256_fmadd_ps(data_1, coef_1, result0);
+      result1 = _mm256_fmadd_ps(data_3, coef_3, result1);
+
+      _mm256_store_ps(dst_ptr, _mm256_add_ps(result0, result1));
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+    current_coeff += filter_size * 8;
+  }
+}
+
+
+/* Universal function supporting 2 ways of processing depending on the max offset of the source samples to read in the resampling program :
+1. For high upsampling ratios it uses low read (single 8 float source samples) and permute-transpose before V-fma
+2. For downsample and no-resize convolution - use each input sequence gathering by direct addressing
+*/
+template<int filtersizemod4>
+void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  assert(filtersizemod4 >= 0 && filtersizemod4 <= 3);
+
+  const int filter_size = program->filter_size; // aligned, practically the coeff table stride
+
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  constexpr int PIXELS_AT_A_TIME = 8; // Process eight pixels in parallel using AVX2 (2x4 using m128 lanes)
+
+  // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width.
+  // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back
+  // from the target width, as we load 4 floats at once with '_mm_loadu_ps'.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
+
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop.
+
+  // 'target_size_alignment' ensures we can safely access coefficients using offsets like
+  // 'filter_size * 7' when processing 8 H pixels at a time or
+  // 'filter_size * 15' when processing 16 H pixels at a time
+  assert(program->target_size_alignment >= 8);
+
+  // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads.
+  assert(program->filter_size_alignment >= 4);
+
+  bool bDoGather = false;
+  // Analyse input resampling program to select method of processing
+  for (int x = 0; x < width - 8; x += 8) // -8 to save from vector overrread at program->pixel_offset[x + 7 + 3]; ?
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 7];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 1];
+    end_off = program->pixel_offset[x + 7 + 1];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 2];
+    end_off = program->pixel_offset[x + 7 + 2];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 3];
+    end_off = program->pixel_offset[x + 7 + 3];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 8) bDoGather = true; 
+  }
+
+  if (bDoGather)
+  {
+    int x = 0;
+
+    // This 'auto' lambda construct replaces the need of templates
+    auto do_h_float_core = [&](auto partial_load) {
+      // Load up to 2x4 coefficients at once before the height loop.
+      // Pre-loading and transposing coefficients keeps register usage efficient.
+      // Assumes 'filter_size_aligned' is at least 4.
+
+      // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3] and for src_ptr + begin5 [0..3] )
+      __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+      __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+      __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+      __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+
+      _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src;
+
+      // Pixel offsets for the current target x-positions.
+      // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+      const int begin1 = program->pixel_offset[x + 0];
+      const int begin2 = program->pixel_offset[x + 1];
+      const int begin3 = program->pixel_offset[x + 2];
+      const int begin4 = program->pixel_offset[x + 3];
+      const int begin5 = program->pixel_offset[x + 4];
+      const int begin6 = program->pixel_offset[x + 5];
+      const int begin7 = program->pixel_offset[x + 6];
+      const int begin8 = program->pixel_offset[x + 7];
+
+      for (int y = 0; y < height; y++)
+      {
+        __m256 data_1_data_5;
+        __m256 data_2_data_6;
+        __m256 data_3_data_7;
+        __m256 data_4_data_8;
+
+        if constexpr (partial_load) {
+          // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+          // to prevent reading beyond the allocated source scanline. This handles cases where loading 4 floats
+          // starting from 'src_ptr + beginX' might exceed the source buffer.
+
+          // Example of the unsafe scenario: If target width is 320, a naive load at src_ptr + 317
+          // would attempt to read floats at indices 317, 318, 319, and 320, potentially going out of bounds.
+
+          // Two main issues in the unsafe zone:
+          // 1.) Out-of-bounds memory access: Reading beyond the allocated memory for the source scanline can
+          //     lead to access violations and crashes. '_mm_loadu_ps' attempts to load 16 bytes, so even if
+          //     the starting address is within bounds, subsequent reads might not be.
+          // 2.) Garbage or NaN values: Even if a read doesn't cause a crash, accessing uninitialized or
+          //     out-of-bounds memory (especially for float types) can result in garbage data, including NaN.
+          //     Multiplying by a valid coefficient and accumulating this NaN can contaminate the final result.
+
+          // '_mm256_load_partial_safe_2_m128' safely loads up to 'filter_size_real' pixels and pads with zeros if needed,
+          // preventing out-of-bounds reads and ensuring predictable results even near the image edges.
+
+          data_1_data_5 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5);
+          data_2_data_6 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6);
+          data_3_data_7 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7);
+          data_4_data_8 = _mm256_load_partial_safe_2_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8);
+        }
+        else {
+          // In the safe zone, we can directly load 4 pixels at a time using unaligned loads.
+          data_1_data_5 = _mm256_loadu_2_m128(src_ptr + begin1, src_ptr + begin5);
+          data_2_data_6 = _mm256_loadu_2_m128(src_ptr + begin2, src_ptr + begin6);
+          data_3_data_7 = _mm256_loadu_2_m128(src_ptr + begin3, src_ptr + begin7);
+          data_4_data_8 = _mm256_loadu_2_m128(src_ptr + begin4, src_ptr + begin8);
+        }
+
+        _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
+
+        __m256 result = _mm256_mul_ps(data_1_data_5, coef_1_coef_5);
+        result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+        result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+        result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+
+        _mm256_store_ps(dst_ptr, result);
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      } // y
+      current_coeff += filter_size * 8; // Move to the next set of coefficients for the next 8 output pixels
+    }; // end of lambda
+
+  // Process the 'safe zone' where direct full unaligned loads are acceptable.
+    for (; x < width_safe_mod; x += PIXELS_AT_A_TIME)
+    {
+      do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm_loadu_ps
+    }
+
+    // Process the potentially 'unsafe zone' near the image edge, using safe loading.
+    for (; x < width; x += PIXELS_AT_A_TIME)
+    {
+      do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm256_load_partial_safe_2_m128'
+    }
+  } // if bDoGather
+  else
+  {
+    // do permutex-based upsample
+    for (int x = 0; x < width; x += 8)
+    {
+      // prepare coefs in transposed V-form
+      __m256 coef_0 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+      __m256 coef_1 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+      __m256 coef_2 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+      __m256 coef_3 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+
+      _MM_TRANSPOSE8_LANE4_PS(coef_0, coef_1, coef_2, coef_3);
+
+      // convert resampling program in H-form into permuting indexes for src transposition in V-form
+      int iStart = program->pixel_offset[x + 0];
+
+      __m256i perm_0 = _mm256_set_epi32(
+        program->pixel_offset[x + 7] - iStart,
+        program->pixel_offset[x + 6] - iStart,
+        program->pixel_offset[x + 5] - iStart,
+        program->pixel_offset[x + 4] - iStart,
+        program->pixel_offset[x + 3] - iStart,
+        program->pixel_offset[x + 2] - iStart,
+        program->pixel_offset[x + 1] - iStart,
+        0);
+      __m256i one_epi32 = _mm256_set1_epi32(1);
+      __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
+      one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+      __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
+      one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+      __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
+      /*
+      one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 1] - program->pixel_offset[x + 0]);
+      __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
+      one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+      __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
+      one_epi32 = _mm256_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+      __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
+      __m256i perm_1 = _mm256_set_epi32(
+        program->pixel_offset[x + 7] - iStart,
+        program->pixel_offset[x + 6] - iStart,
+        program->pixel_offset[x + 5] - iStart,
+        program->pixel_offset[x + 4] - iStart,
+        program->pixel_offset[x + 3] - iStart,
+        program->pixel_offset[x + 2] - iStart,
+        program->pixel_offset[x + 1] - iStart,
+        0);
+      */
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+      for (int y = 0; y < height; y++)
+      {
+        __m256 data_src = _mm256_loadu_ps(src_ptr);
+
+        __m256 data_0 = _mm256_permutevar8x32_ps(data_src, perm_0);
+        __m256 data_1 = _mm256_permutevar8x32_ps(data_src, perm_1);
+        __m256 data_2 = _mm256_permutevar8x32_ps(data_src, perm_2);
+        __m256 data_3 = _mm256_permutevar8x32_ps(data_src, perm_3);
+
+        __m256 result0 = _mm256_mul_ps(data_0, coef_0);
+        __m256 result1 = _mm256_mul_ps(data_2, coef_2);
+
+        result0 = _mm256_fmadd_ps(data_1, coef_1, result0);
+        result1 = _mm256_fmadd_ps(data_3, coef_3, result1);
+
+        _mm256_store_ps(dst_ptr, _mm256_add_ps(result0, result1));
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      }
+      current_coeff += filter_size * 8;
+    }
+  }
+}
+
+// Instantiate them
+template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+
+void resize_h_planar_float_avx2_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+
+  // assert - check if max pixel_offset is not above single load of 8 src floats (or need several loads and more complex permute program)
+#ifdef _DEBUG
+  for (int x = 0; x < width; x += 8)
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 7];
+    assert((end_off - start_off) > 7);
+  }
+#endif
+
+  int filter_size = program->filter_size; // must be 8
+  assert(filter_size != 8);
+
+  const float* AVS_RESTRICT current_coeff;
+  __m256i one_epi32 = _mm256_set1_epi32(1);
+
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 8)
+  {
+    // prepare coefs in transposed V-form
+    __m256 coef_0 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+    __m256 coef_1 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+    __m256 coef_2 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+    __m256 coef_3 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+
+    __m256 coef_4 = _mm256_load_2_m128(current_coeff + filter_size * 0 + 4, current_coeff + filter_size * 4 + 4);
+    __m256 coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 1 + 4, current_coeff + filter_size * 5 + 4);
+    __m256 coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 2 + 4, current_coeff + filter_size * 6 + 4);
+    __m256 coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 3 + 4, current_coeff + filter_size * 7 + 4);
+
+    _MM_TRANSPOSE8_LANE4_PS(coef_0, coef_1, coef_2, coef_3);
+    _MM_TRANSPOSE8_LANE4_PS(coef_4, coef_5, coef_6, coef_7);
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+    __m256i perm_0 = _mm256_set_epi32(program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
+    __m256i perm_1 = _mm256_add_epi32(perm_0, one_epi32);
+    __m256i perm_2 = _mm256_add_epi32(perm_1, one_epi32);
+    __m256i perm_3 = _mm256_add_epi32(perm_2, one_epi32);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+    for (int y = 0; y < height; y++)
+    {
+      __m256 result;
+      __m256 data_src = _mm256_loadu_ps(src_ptr);
+
+      __m256 data_0 = _mm256_permutevar8x32_ps(data_src, perm_0);
+      __m256 data_1 = _mm256_permutevar8x32_ps(data_src, perm_1);
+      __m256 data_2 = _mm256_permutevar8x32_ps(data_src, perm_2);
+      __m256 data_3 = _mm256_permutevar8x32_ps(data_src, perm_3);
+
+      __m256 result0 = _mm256_mul_ps(data_0, coef_0);
+      __m256 result1 = _mm256_mul_ps(data_2, coef_2);
+
+      result0 = _mm256_fmadd_ps(data_1, coef_1, result0);
+      result1 = _mm256_fmadd_ps(data_3, coef_3, result1);
+
+      result = _mm256_add_ps(result0, result1);
+
+      // next next 4 samples + 4 coefs
+      data_src = _mm256_loadu_ps(src_ptr + 4);
+
+      data_0 = _mm256_permutevar8x32_ps(data_src, perm_0);
+      data_1 = _mm256_permutevar8x32_ps(data_src, perm_1);
+      data_2 = _mm256_permutevar8x32_ps(data_src, perm_2);
+      data_3 = _mm256_permutevar8x32_ps(data_src, perm_3);
+
+      result0 = _mm256_mul_ps(data_0, coef_4);
+      result1 = _mm256_mul_ps(data_2, coef_6);
+
+      result0 = _mm256_fmadd_ps(data_1, coef_5, result0);
+      result1 = _mm256_fmadd_ps(data_3, coef_7, result1);
+
+      result = _mm256_add_ps(result, result0);
+      result = _mm256_add_ps(result, result1);
+
+      _mm256_store_ps(dst_ptr, result);
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+    current_coeff += filter_size * 8;
+  }
+}
+
+#if 0
+// Original DTL2020, made end-contition safe
+// process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2
+void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+  int filter_size = program->filter_size;
+
+  const float* AVS_RESTRICT current_coeff;
+
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 8)
+  {
+    __m256 coef_1_coef_5 = _mm256_load_2_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4);
+    __m256 coef_2_coef_6 = _mm256_load_2_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5);
+    __m256 coef_3_coef_7 = _mm256_load_2_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6);
+    __m256 coef_4_coef_8 = _mm256_load_2_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7);
+
+    _MM_TRANSPOSE8_LANE4_PS(coef_1_coef_5, coef_2_coef_6, coef_3_coef_7, coef_4_coef_8);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src;
+
+    for (int y = 0; y < height; y++)
+    {
+      __m256 data_1_data_5 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 0], src_ptr + program->pixel_offset[x + 4]);
+      __m256 data_2_data_6 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 1], src_ptr + program->pixel_offset[x + 5]);
+      __m256 data_3_data_7 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 2], src_ptr + program->pixel_offset[x + 6]);
+      __m256 data_4_data_8 = _mm256_loadu_2_m128(src_ptr + program->pixel_offset[x + 3], src_ptr + program->pixel_offset[x + 7]);
+
+      _MM_TRANSPOSE8_LANE4_PS(data_1_data_5, data_2_data_6, data_3_data_7, data_4_data_8);
+
+      __m256 result = _mm256_mul_ps(data_1_data_5, coef_1_coef_5);
+      result = _mm256_fmadd_ps(data_2_data_6, coef_2_coef_6, result);
+      result = _mm256_fmadd_ps(data_3_data_7, coef_3_coef_7, result);
+      result = _mm256_fmadd_ps(data_4_data_8, coef_4_coef_8, result);
+
+      _mm256_store_ps(dst_ptr, result);
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+    current_coeff += filter_size * 8;
+  }
+
+}
+#endif
diff --git a/avs_core/filters/intel/resample_avx2.h b/avs_core/filters/intel/resample_avx2.h
index a3b163aa3..8f4b41625 100644
--- a/avs_core/filters/intel/resample_avx2.h
+++ b/avs_core/filters/intel/resample_avx2.h
@@ -51,5 +51,75 @@ template<bool lessthan16bit>
 void resize_v_avx2_planar_uint16_t(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 
 void resize_v_avx2_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+void resize_v_avx2_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
+void resize_h_planar_float_avx_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+template<int filtersizemod4>
+void resize_h_planar_float_avx_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+void resize_h_planar_float_avx2_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+template<int filtersizemod4>
+void resize_h_planar_float_avx2_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+// Transpose 4x4 blocks within each lane
+#define _MM_TRANSPOSE8_LANE4_PS(row0, row1, row2, row3) \
+  do { \
+    __m256 __t0, __t1, __t2, __t3; \
+    __t0 = _mm256_unpacklo_ps(row0, row1); \
+    __t1 = _mm256_unpackhi_ps(row0, row1); \
+    __t2 = _mm256_unpacklo_ps(row2, row3); \
+    __t3 = _mm256_unpackhi_ps(row2, row3); \
+    row0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
+    row1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
+    row2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
+    row3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
+  } while (0)
+
+#define _MM_TRANSPOSE8_PS(row0, row1, row2, row3, row4, row5, row6, row7) \
+  do { \
+    __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; \
+    __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; \
+    __t0 = _mm256_unpacklo_ps(row0, row1); \
+    __t1 = _mm256_unpackhi_ps(row0, row1); \
+    __t2 = _mm256_unpacklo_ps(row2, row3); \
+    __t3 = _mm256_unpackhi_ps(row2, row3); \
+    __t4 = _mm256_unpacklo_ps(row4, row5); \
+    __t5 = _mm256_unpackhi_ps(row4, row5); \
+    __t6 = _mm256_unpacklo_ps(row6, row7); \
+    __t7 = _mm256_unpackhi_ps(row6, row7); \
+    __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
+    __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
+    __tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
+    __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
+    __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); \
+    __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); \
+    __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); \
+    __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); \
+    row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); \
+    row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); \
+    row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); \
+    row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); \
+    row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); \
+    row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); \
+    row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); \
+    row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); \
+  } while (0)
+
+
+
+#ifndef _mm256_loadu_2_m128
+#define _mm256_loadu_2_m128(/* __m128 const* */ loaddr, \
+                            /* __m128 const* */ hiaddr) \
+    _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
+#endif
+
+#ifndef _mm256_load_2_m128
+#define _mm256_load_2_m128(/* __m128 const* */ loaddr, \
+                            /* __m128 const* */ hiaddr) \
+    _mm256_set_m128(_mm_load_ps(hiaddr), _mm_load_ps(loaddr))
+#endif
+
 
 #endif // __Resample_AVX2_H__
diff --git a/avs_core/filters/intel/resample_avx512.cpp b/avs_core/filters/intel/resample_avx512.cpp
new file mode 100644
index 000000000..68e9cae78
--- /dev/null
+++ b/avs_core/filters/intel/resample_avx512.cpp
@@ -0,0 +1,2014 @@
+// Avisynth v2.5.  Copyright 2002 Ben Rudiak-Gould et al.
+// http://avisynth.nl
+
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
+// http://www.gnu.org/copyleft/gpl.html .
+//
+// Linking Avisynth statically or dynamically with other modules is making a
+// combined work based on Avisynth.  Thus, the terms and conditions of the GNU
+// General Public License cover the whole combination.
+//
+// As a special exception, the copyright holders of Avisynth give you
+// permission to link Avisynth with independent modules that communicate with
+// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license
+// terms of these independent modules, and to copy and distribute the
+// resulting combined work under terms of your choice, provided that
+// every copy of the combined work is accompanied by a complete copy of
+// the source code of Avisynth (the version of Avisynth used to produce the
+// combined work), being distributed under the terms of the GNU General
+// Public License plus this exception.  An independent module is a module
+// which is not derived from or based on Avisynth, such as 3rd-party filters,
+// import and export plugins, or graphical user interfaces.
+
+#include <avs/config.h>
+#include "../core/internal.h"
+
+#include <avs/alignment.h>
+#include <avs/minmax.h>
+
+#include "resample_avx512.h"
+//------- 512 bit float Horizontals
+
+// Safe quad lane partial load with AVX512
+// Read exactly N pixels (where N mod 4 is the template parameter), avoiding
+// - reading beyond the end of the source buffer.
+// - avoid NaN contamination by padding with zeros.
+template <int Nmod4>
+AVS_FORCEINLINE static __m512 _mm512_load_partial_safe_4_m128(const float* src_ptr_offsetted1, const float* src_ptr_offsetted2, const float* src_ptr_offsetted3, const float* src_ptr_offsetted4) {
+  __m128 s1, s2, s3, s4;
+  switch (Nmod4) {
+  case 1:
+    s1 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted2[0]);
+    s3 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted3[0]);
+    s4 = _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted4[0]);
+    // ideally: movss
+    break;
+  case 2:
+    s1 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    s3 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted3[1], src_ptr_offsetted3[0]);
+    s4 = _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted4[1], src_ptr_offsetted4[0]);
+    // ideally: movsd
+    break;
+  case 3:
+    s1 = _mm_set_ps(0.0f, src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(0.0f, src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    s3 = _mm_set_ps(0.0f, src_ptr_offsetted3[2], src_ptr_offsetted3[1], src_ptr_offsetted3[0]);
+    s4 = _mm_set_ps(0.0f, src_ptr_offsetted4[2], src_ptr_offsetted4[1], src_ptr_offsetted4[0]);
+    // ideally: movss + movsd + shuffle or movsd + insert
+    break;
+  case 0:
+    s1 = _mm_set_ps(src_ptr_offsetted1[3], src_ptr_offsetted1[2], src_ptr_offsetted1[1], src_ptr_offsetted1[0]);
+    s2 = _mm_set_ps(src_ptr_offsetted2[3], src_ptr_offsetted2[2], src_ptr_offsetted2[1], src_ptr_offsetted2[0]);
+    s3 = _mm_set_ps(src_ptr_offsetted3[3], src_ptr_offsetted3[2], src_ptr_offsetted3[1], src_ptr_offsetted3[0]);
+    s4 = _mm_set_ps(src_ptr_offsetted4[3], src_ptr_offsetted4[2], src_ptr_offsetted4[1], src_ptr_offsetted4[0]);
+    // ideally: movups
+    break;
+  default:
+    s1 = _mm_setzero_ps(); // n/a cannot happen
+    s2 = _mm_setzero_ps();
+    s3 = _mm_setzero_ps();
+    s4 = _mm_setzero_ps();
+  }
+  __m512 result = _mm512_castps128_ps512(s1); // Cast the first __m128 to __m512
+  result = _mm512_insertf32x4(result, s2, 1); // Insert the second __m128 at position 1
+  result = _mm512_insertf32x4(result, s3, 2); // Insert the third __m128 at position 2
+  result = _mm512_insertf32x4(result, s4, 3); // Insert the fourth __m128 at position 3
+  return result;
+}
+
+
+
+
+// Processes a horizontal resampling kernel of up to four coefficients for float pixel types.
+// Supports BilinearResize, BicubicResize, or sinc with up to 2 taps (filter size <= 4).
+// AVX512 optimization loads and processes four float coefficients and sixteen pixels simultaneously.
+// The 'filtersizemod4' template parameter (0-3) helps optimize for different filter sizes modulo 4.
+// This AVX512 requires only filter_size_alignment of 4.
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+  assert(filtersizemod4 >= 0 && filtersizemod4 <= 3);
+
+  const int filter_size = program->filter_size; // aligned, practically the coeff table stride
+
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  constexpr int PIXELS_AT_A_TIME = 16; // Process sixteen pixels in parallel using AVX512 (4x4 using m128 lanes)
+
+  // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width.
+  // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back
+  // from the target width, as we load 4 floats at once conceptually with our safe load.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
+
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop.
+
+  // 'target_size_alignment' ensures we can safely access coefficients using offsets like
+  // 'filter_size * 7' when processing 8 H pixels at a time or
+  // 'filter_size * 15' when processing 16 H pixels at a time
+  assert(program->target_size_alignment >= 16); // Adjusted for 16 pixels
+  assert(FRAME_ALIGN >= 64); // Adjusted for 16 pixels AviSynth+ default
+
+  // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads.
+  assert(program->filter_size_alignment >= 4);
+
+  int x = 0;
+
+  // This 'auto' lambda construct replaces the need of templates
+  auto do_h_float_core = [&](auto partial_load) {
+    // Load up to 4x4 coefficients at once before the height loop.
+    // Pre-loading and transposing coefficients keeps register usage efficient.
+    // Assumes 'filter_size_aligned' is at least 4.
+
+    // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3])
+    __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+    __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+    __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+    __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+    _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src;
+
+    // Pixel offsets for the current target x-positions.
+    // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+    const int begin1 = program->pixel_offset[x + 0];
+    const int begin2 = program->pixel_offset[x + 1];
+    const int begin3 = program->pixel_offset[x + 2];
+    const int begin4 = program->pixel_offset[x + 3];
+    const int begin5 = program->pixel_offset[x + 4];
+    const int begin6 = program->pixel_offset[x + 5];
+    const int begin7 = program->pixel_offset[x + 6];
+    const int begin8 = program->pixel_offset[x + 7];
+    const int begin9 = program->pixel_offset[x + 8];
+    const int begin10 = program->pixel_offset[x + 9];
+    const int begin11 = program->pixel_offset[x + 10];
+    const int begin12 = program->pixel_offset[x + 11];
+    const int begin13 = program->pixel_offset[x + 12];
+    const int begin14 = program->pixel_offset[x + 13];
+    const int begin15 = program->pixel_offset[x + 14];
+    const int begin16 = program->pixel_offset[x + 15];
+
+    for (int y = 0; y < height; y++)
+    {
+      __m512 data_1_5_9_13;
+      __m512 data_2_6_10_14;
+      __m512 data_3_7_11_15;
+      __m512 data_4_8_12_16;
+
+      if constexpr (partial_load) {
+        // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+        // to prevent reading beyond the allocated source scanline.
+
+        data_1_5_9_13 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+        data_2_6_10_14 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+        data_3_7_11_15 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+        data_4_8_12_16 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+      }
+      else {
+        // In the safe zone, we can directly load 4 pixels at a time for each of the four lanes.
+        data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+        data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+        data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+        data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+      }
+
+      _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16);
+
+      __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13);
+      result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result);
+      result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
+      result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
+
+      _mm512_store_ps(dst_ptr, result); 
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    } // y
+    current_coeff += filter_size * 16; // Move to the next set of coefficients for the next 16 output pixels
+    }; // end of lambda
+
+  // Process the 'safe zone' where direct full unaligned loads are acceptable.
+  for (; x < width_safe_mod; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128
+  }
+
+  // Process the potentially 'unsafe zone' near the image edge, using safe loading.
+  for (; x < width; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm512_load_partial_safe_4_m128'
+  }
+}
+
+// Instantiate them
+template void resize_h_planar_float_avx512_transpose_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_transpose_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_transpose_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_transpose_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+
+/* Universal function supporting 2 ways of processing depending on the max offset of the source samples to read in the resampling program :
+1. For high upsampling ratios it uses low read (single 8 float source samples) and permute-transpose before V-fma
+2. For downsample and no-resize convolution - use each input sequence gathering by direct addressing
+*/
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  assert(filtersizemod4 >= 0 && filtersizemod4 <= 3);
+
+  const int filter_size = program->filter_size; // aligned, practically the coeff table stride
+
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  constexpr int PIXELS_AT_A_TIME = 16; // Process sixteen pixels in parallel using AVX512 (4x4 using m128 lanes)
+
+  // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width.
+  // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back
+  // from the target width, as we load 4 floats at once conceptually with our safe load.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
+
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop.
+
+  // 'target_size_alignment' ensures we can safely access coefficients using offsets like
+  // 'filter_size * 7' when processing 8 H pixels at a time or
+  // 'filter_size * 15' when processing 16 H pixels at a time
+  assert(program->target_size_alignment >= 16); // Adjusted for 16 pixels
+  assert(FRAME_ALIGN >= 64); // Adjusted for 16 pixels AviSynth+ default
+
+  // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads.
+  assert(program->filter_size_alignment >= 4);
+
+  bool bDoGather = false;
+  // Analyse input resampling program to select method of processing
+  for (int x = 0; x < width - 16; x += 16) // -16 to save from vector overrread at program->pixel_offset[x + 15 + 3]; ?
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 15];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 1];
+    end_off = program->pixel_offset[x + 15 + 1];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 2];
+    end_off = program->pixel_offset[x + 15 + 2];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 3];
+    end_off = program->pixel_offset[x + 15 + 3];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+  }
+
+  int x = 0;
+
+  if (bDoGather)
+  {
+    // This 'auto' lambda construct replaces the need of templates
+    auto do_h_float_core = [&](auto partial_load) {
+      // Load up to 4x4 coefficients at once before the height loop.
+      // Pre-loading and transposing coefficients keeps register usage efficient.
+      // Assumes 'filter_size_aligned' is at least 4.
+
+      // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3])
+      __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src;
+
+      // Pixel offsets for the current target x-positions.
+      // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+      const int begin1 = program->pixel_offset[x + 0];
+      const int begin2 = program->pixel_offset[x + 1];
+      const int begin3 = program->pixel_offset[x + 2];
+      const int begin4 = program->pixel_offset[x + 3];
+      const int begin5 = program->pixel_offset[x + 4];
+      const int begin6 = program->pixel_offset[x + 5];
+      const int begin7 = program->pixel_offset[x + 6];
+      const int begin8 = program->pixel_offset[x + 7];
+      const int begin9 = program->pixel_offset[x + 8];
+      const int begin10 = program->pixel_offset[x + 9];
+      const int begin11 = program->pixel_offset[x + 10];
+      const int begin12 = program->pixel_offset[x + 11];
+      const int begin13 = program->pixel_offset[x + 12];
+      const int begin14 = program->pixel_offset[x + 13];
+      const int begin15 = program->pixel_offset[x + 14];
+      const int begin16 = program->pixel_offset[x + 15];
+
+      for (int y = 0; y < height; y++)
+      {
+        __m512 data_1_5_9_13;
+        __m512 data_2_6_10_14;
+        __m512 data_3_7_11_15;
+        __m512 data_4_8_12_16;
+
+        if constexpr (partial_load) {
+          // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+          // to prevent reading beyond the allocated source scanline.
+
+          data_1_5_9_13 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+        }
+        else {
+          // In the safe zone, we can directly load 4 pixels at a time for each of the four lanes.
+          data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+        }
+
+        _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16);
+
+        __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13);
+        result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result);
+        result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
+        result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
+
+        _mm512_store_ps(dst_ptr, result); 
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      } // y
+      current_coeff += filter_size * 16; // Move to the next set of coefficients for the next 16 output pixels
+    }; // end of lambda
+
+  // Process the 'safe zone' where direct full unaligned loads are acceptable.
+    for (; x < width_safe_mod; x += PIXELS_AT_A_TIME)
+    {
+      do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128
+    }
+
+    // Process the potentially 'unsafe zone' near the image edge, using safe loading.
+    for (; x < width; x += PIXELS_AT_A_TIME)
+    {
+      do_h_float_core(std::true_type{}); // partial_load == true, use the safer '_mm512_load_partial_safe_4_m128'
+    }
+  } 
+  else // if(bDoGather)
+  {
+    for (int x = 0; x < width; x += 16)
+    {
+      // prepare coefs in transposed V-form
+      __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
+
+      // convert resampling program in H-form into permuting indexes for src transposition in V-form
+      int iStart = program->pixel_offset[x + 0];
+
+      __m512i perm_0 = _mm512_set_epi32(
+        program->pixel_offset[x + 15] - iStart,
+        program->pixel_offset[x + 14] - iStart,
+        program->pixel_offset[x + 13] - iStart,
+        program->pixel_offset[x + 12] - iStart,
+        program->pixel_offset[x + 11] - iStart,
+        program->pixel_offset[x + 10] - iStart,
+        program->pixel_offset[x + 9] - iStart,
+        program->pixel_offset[x + 8] - iStart,
+        program->pixel_offset[x + 7] - iStart,
+        program->pixel_offset[x + 6] - iStart,
+        program->pixel_offset[x + 5] - iStart,
+        program->pixel_offset[x + 4] - iStart,
+        program->pixel_offset[x + 3] - iStart,
+        program->pixel_offset[x + 2] - iStart,
+        program->pixel_offset[x + 1] - iStart,
+        0);
+
+      __m512i one_epi32 = _mm512_set1_epi32(1);
+      __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+      __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+      __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+      for (int y = 0; y < height; y++) // single row proc
+      {
+        __m512 data_src = _mm512_loadu_ps(src_ptr);
+        __m512 data_src2 = _mm512_loadu_ps(src_ptr + 16); // not always needed for upscale also can cause end of buffer overread - need to add limitation (special end of buffer processing ?)
+
+        __m512 data_0 = _mm512_permutex2var_ps(data_src, perm_0, data_src2);
+        __m512 data_1 = _mm512_permutex2var_ps(data_src, perm_1, data_src2);
+        __m512 data_2 = _mm512_permutex2var_ps(data_src, perm_2, data_src2);
+        __m512 data_3 = _mm512_permutex2var_ps(data_src, perm_3, data_src2);
+
+        __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+        __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+        result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+        result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+        _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      }
+
+      current_coeff += filter_size * 16;
+    }
+  }
+}
+
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+
+/* Universal function supporting 2 ways of processing depending on the max offset of the source samples to read in the resampling program :
+1. For high upsampling ratios it uses low read (single 8 float source samples) and permute-transpose before V-fma
+2. For downsample and no-resize convolution - use each input sequence gathering by direct addressing
+*/
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  assert(filtersizemod4 >= 0 && filtersizemod4 <= 3);
+
+  const int filter_size = program->filter_size; // aligned, practically the coeff table stride
+
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  const int width_mod32 = (width / 32) * 32; // Process by 2x 512it (2 x 16 floats) to make memory read/write linear streams longer,
+
+  constexpr int MAX_PIXELS_AT_A_TIME = 32; // Process sixteen pixels in parallel using AVX512 (4x4 using m128 lanes)
+  constexpr int PIXELS_AT_A_TIME = 16; // Process sixteen pixels in parallel using AVX512 (4x4 using m128 lanes)
+
+  // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width.
+  // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back
+  // from the target width, as we load 4 floats at once conceptually with our safe load.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / MAX_PIXELS_AT_A_TIME * MAX_PIXELS_AT_A_TIME;
+
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop.
+
+  // 'target_size_alignment' ensures we can safely access coefficients using offsets like
+  // 'filter_size * 7' when processing 8 H pixels at a time or
+  // 'filter_size * 15' when processing 16 H pixels at a time
+  assert(program->target_size_alignment >= 16); // Adjusted for 16 pixels
+  assert(FRAME_ALIGN >= 64); // Adjusted for 16 pixels AviSynth+ default
+
+  // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads.
+  assert(program->filter_size_alignment >= 4);
+
+  bool bDoGather = false;
+  // Analyse input resampling program to select method of processing
+  for (int x = 0; x < width - 16; x += 16) // -16 to save from vector overrread at program->pixel_offset[x + 15 + 3]; ?
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 15];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 1];
+    end_off = program->pixel_offset[x + 15 + 1];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 2];
+    end_off = program->pixel_offset[x + 15 + 2];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+
+    start_off = program->pixel_offset[x + 3];
+    end_off = program->pixel_offset[x + 15 + 3];
+    if ((end_off - start_off) + (program->filter_size_real - 1) > 32) bDoGather = true;
+  }
+
+  int x = 0;
+
+  if (bDoGather) 
+  {
+    // This 'auto' lambda construct replaces the need of templates
+    auto do_h_float_core_16 = [&](auto partial_load) {
+      // Load up to 4x4 coefficients at once before the height loop.
+      // Pre-loading and transposing coefficients keeps register usage efficient.
+      // Assumes 'filter_size_aligned' is at least 4.
+
+      // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3])
+      __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src;
+
+      // Pixel offsets for the current target x-positions.
+      // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+      const int begin1 = program->pixel_offset[x + 0];
+      const int begin2 = program->pixel_offset[x + 1];
+      const int begin3 = program->pixel_offset[x + 2];
+      const int begin4 = program->pixel_offset[x + 3];
+      const int begin5 = program->pixel_offset[x + 4];
+      const int begin6 = program->pixel_offset[x + 5];
+      const int begin7 = program->pixel_offset[x + 6];
+      const int begin8 = program->pixel_offset[x + 7];
+      const int begin9 = program->pixel_offset[x + 8];
+      const int begin10 = program->pixel_offset[x + 9];
+      const int begin11 = program->pixel_offset[x + 10];
+      const int begin12 = program->pixel_offset[x + 11];
+      const int begin13 = program->pixel_offset[x + 12];
+      const int begin14 = program->pixel_offset[x + 13];
+      const int begin15 = program->pixel_offset[x + 14];
+      const int begin16 = program->pixel_offset[x + 15];
+
+      for (int y = 0; y < height; y++)
+      {
+        __m512 data_1_5_9_13;
+        __m512 data_2_6_10_14;
+        __m512 data_3_7_11_15;
+        __m512 data_4_8_12_16;
+
+        if constexpr (partial_load) {
+          // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+          // to prevent reading beyond the allocated source scanline.
+
+          data_1_5_9_13 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+        }
+        else {
+          // In the safe zone, we can directly load 4 pixels at a time for each of the four lanes.
+          data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+        }
+
+        _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16);
+
+        __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13);
+        result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result);
+        result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
+        result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
+
+        _mm512_store_ps(dst_ptr, result); 
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      } // y
+      current_coeff += filter_size * 16; // Move to the next set of coefficients for the next 16 output pixels
+    }; // end of lambda_16
+
+    // This 'auto' lambda construct replaces the need of templates
+    auto do_h_float_core_32 = [&](auto partial_load) {
+      // Load up to 4x4 coefficients at once before the height loop.
+      // Pre-loading and transposing coefficients keeps register usage efficient.
+      // Assumes 'filter_size_aligned' is at least 4.
+
+      // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3])
+      __m512 coef_1_5_9_13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_2_6_10_14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_3_7_11_15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_4_8_12_16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13, coef_2_6_10_14, coef_3_7_11_15, coef_4_8_12_16);
+
+      // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3], begin5 [0..3], begin9 [0..3], begin13 [0..3])
+      __m512 coef_1_5_9_13_2 = _mm512_load_4_m128(current_coeff + filter_size * 16, current_coeff + filter_size * 20, current_coeff + filter_size * 24, current_coeff + filter_size * 28);
+      __m512 coef_2_6_10_14_2 = _mm512_load_4_m128(current_coeff + filter_size * 17, current_coeff + filter_size * 21, current_coeff + filter_size * 25, current_coeff + filter_size * 29);
+      __m512 coef_3_7_11_15_2 = _mm512_load_4_m128(current_coeff + filter_size * 18, current_coeff + filter_size * 22, current_coeff + filter_size * 26, current_coeff + filter_size * 30);
+      __m512 coef_4_8_12_16_2 = _mm512_load_4_m128(current_coeff + filter_size * 19, current_coeff + filter_size * 23, current_coeff + filter_size * 27, current_coeff + filter_size * 31);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_1_5_9_13_2, coef_2_6_10_14_2, coef_3_7_11_15_2, coef_4_8_12_16_2);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src;
+
+      // Pixel offsets for the current target x-positions.
+      // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+      const int begin1 = program->pixel_offset[x + 0];
+      const int begin2 = program->pixel_offset[x + 1];
+      const int begin3 = program->pixel_offset[x + 2];
+      const int begin4 = program->pixel_offset[x + 3];
+      const int begin5 = program->pixel_offset[x + 4];
+      const int begin6 = program->pixel_offset[x + 5];
+      const int begin7 = program->pixel_offset[x + 6];
+      const int begin8 = program->pixel_offset[x + 7];
+      const int begin9 = program->pixel_offset[x + 8];
+      const int begin10 = program->pixel_offset[x + 9];
+      const int begin11 = program->pixel_offset[x + 10];
+      const int begin12 = program->pixel_offset[x + 11];
+      const int begin13 = program->pixel_offset[x + 12];
+      const int begin14 = program->pixel_offset[x + 13];
+      const int begin15 = program->pixel_offset[x + 14];
+      const int begin16 = program->pixel_offset[x + 15];
+
+      // Pixel offsets for the current target x-positions.
+      // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+      const int begin1_2 = program->pixel_offset[x + 16];
+      const int begin2_2 = program->pixel_offset[x + 17];
+      const int begin3_2 = program->pixel_offset[x + 18];
+      const int begin4_2 = program->pixel_offset[x + 19];
+      const int begin5_2 = program->pixel_offset[x + 20];
+      const int begin6_2 = program->pixel_offset[x + 21];
+      const int begin7_2 = program->pixel_offset[x + 22];
+      const int begin8_2 = program->pixel_offset[x + 23];
+      const int begin9_2 = program->pixel_offset[x + 24];
+      const int begin10_2 = program->pixel_offset[x + 25];
+      const int begin11_2 = program->pixel_offset[x + 26];
+      const int begin12_2 = program->pixel_offset[x + 27];
+      const int begin13_2 = program->pixel_offset[x + 28];
+      const int begin14_2 = program->pixel_offset[x + 29];
+      const int begin15_2 = program->pixel_offset[x + 30];
+      const int begin16_2 = program->pixel_offset[x + 31];
+
+      for (int y = 0; y < height; y++)
+      {
+        __m512 data_1_5_9_13;
+        __m512 data_2_6_10_14;
+        __m512 data_3_7_11_15;
+        __m512 data_4_8_12_16;
+
+        __m512 data_1_5_9_13_2;
+        __m512 data_2_6_10_14_2;
+        __m512 data_3_7_11_15_2;
+        __m512 data_4_8_12_16_2;
+
+        if constexpr (partial_load) {
+          // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+          // to prevent reading beyond the allocated source scanline.
+
+          data_1_5_9_13 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+
+          data_1_5_9_13_2 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin1_2, src_ptr + begin5_2, src_ptr + begin9_2, src_ptr + begin13_2);
+          data_2_6_10_14_2 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin2_2, src_ptr + begin6_2, src_ptr + begin10_2, src_ptr + begin14_2);
+          data_3_7_11_15_2 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin3_2, src_ptr + begin7_2, src_ptr + begin11_2, src_ptr + begin15_2);
+          data_4_8_12_16_2 = _mm512_load_partial_safe_4_m128<filtersizemod4>(src_ptr + begin4_2, src_ptr + begin8_2, src_ptr + begin12_2, src_ptr + begin16_2);
+
+        }
+        else {
+          // In the safe zone, we can directly load 4 pixels at a time for each of the four lanes.
+          data_1_5_9_13 = _mm512_loadu_4_m128(src_ptr + begin1, src_ptr + begin5, src_ptr + begin9, src_ptr + begin13);
+          data_2_6_10_14 = _mm512_loadu_4_m128(src_ptr + begin2, src_ptr + begin6, src_ptr + begin10, src_ptr + begin14);
+          data_3_7_11_15 = _mm512_loadu_4_m128(src_ptr + begin3, src_ptr + begin7, src_ptr + begin11, src_ptr + begin15);
+          data_4_8_12_16 = _mm512_loadu_4_m128(src_ptr + begin4, src_ptr + begin8, src_ptr + begin12, src_ptr + begin16);
+
+          data_1_5_9_13_2 = _mm512_loadu_4_m128(src_ptr + begin1_2, src_ptr + begin5_2, src_ptr + begin9_2, src_ptr + begin13_2);
+          data_2_6_10_14_2 = _mm512_loadu_4_m128(src_ptr + begin2_2, src_ptr + begin6_2, src_ptr + begin10_2, src_ptr + begin14_2);
+          data_3_7_11_15_2 = _mm512_loadu_4_m128(src_ptr + begin3_2, src_ptr + begin7_2, src_ptr + begin11_2, src_ptr + begin15_2);
+          data_4_8_12_16_2 = _mm512_loadu_4_m128(src_ptr + begin4_2, src_ptr + begin8_2, src_ptr + begin12_2, src_ptr + begin16_2);
+
+        }
+
+        _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13, data_2_6_10_14, data_3_7_11_15, data_4_8_12_16);
+        _MM_TRANSPOSE16_LANE4_PS(data_1_5_9_13_2, data_2_6_10_14_2, data_3_7_11_15_2, data_4_8_12_16_2);
+
+        __m512 result = _mm512_mul_ps(data_1_5_9_13, coef_1_5_9_13);
+        result = _mm512_fmadd_ps(data_2_6_10_14, coef_2_6_10_14, result);
+        result = _mm512_fmadd_ps(data_3_7_11_15, coef_3_7_11_15, result);
+        result = _mm512_fmadd_ps(data_4_8_12_16, coef_4_8_12_16, result);
+
+        __m512 result_2 = _mm512_mul_ps(data_1_5_9_13_2, coef_1_5_9_13_2);
+        result_2 = _mm512_fmadd_ps(data_2_6_10_14_2, coef_2_6_10_14_2, result_2);
+        result_2 = _mm512_fmadd_ps(data_3_7_11_15_2, coef_3_7_11_15_2, result_2);
+        result_2 = _mm512_fmadd_ps(data_4_8_12_16_2, coef_4_8_12_16_2, result_2);
+
+
+        _mm512_store_ps(dst_ptr, result); 
+        _mm512_store_ps(dst_ptr + 16, result_2);
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      } // y
+      current_coeff += filter_size * 32; // Move to the next set of coefficients for the next 32 output pixels
+    }; // end of lambda
+
+    // Process the 'safe zone' where direct full unaligned loads are acceptable.
+    for (; x < std::min(width_mod32, width_safe_mod); x += 32)
+    {
+      do_h_float_core_32(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128
+    }
+
+    for (width_mod32; x < width_safe_mod; x += PIXELS_AT_A_TIME) 
+    {
+      do_h_float_core_16(std::false_type{}); // partial_load == false, use direct _mm512_loadu_4_m128
+    }
+
+    // Process the potentially 'unsafe zone' near the image edge, using safe loading.
+    for (; x < width; x += PIXELS_AT_A_TIME)
+    {
+      do_h_float_core_16(std::true_type{}); // partial_load == true, use the safer '_mm512_load_partial_safe_4_m128'
+    }
+  }
+  else // if(bDoGather)
+  {
+    for (int x = 0; x < width_mod32; x += 32)
+    {
+      // prepare coefs in transposed V-form
+      __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
+
+      __m512 coef_r0_2 = _mm512_load_4_m128(current_coeff + filter_size * 16, current_coeff + filter_size * 20, current_coeff + filter_size * 24, current_coeff + filter_size * 28);
+      __m512 coef_r1_2 = _mm512_load_4_m128(current_coeff + filter_size * 17, current_coeff + filter_size * 21, current_coeff + filter_size * 25, current_coeff + filter_size * 29);
+      __m512 coef_r2_2 = _mm512_load_4_m128(current_coeff + filter_size * 18, current_coeff + filter_size * 22, current_coeff + filter_size * 26, current_coeff + filter_size * 30);
+      __m512 coef_r3_2 = _mm512_load_4_m128(current_coeff + filter_size * 19, current_coeff + filter_size * 23, current_coeff + filter_size * 27, current_coeff + filter_size * 31);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_r0_2, coef_r1_2, coef_r2_2, coef_r3_2);
+
+      // convert resampling program in H-form into permuting indexes for src transposition in V-form
+      int iStart = program->pixel_offset[x + 0];
+
+      __m512i perm_0 = _mm512_set_epi32(
+        program->pixel_offset[x + 15] - iStart,
+        program->pixel_offset[x + 14] - iStart,
+        program->pixel_offset[x + 13] - iStart,
+        program->pixel_offset[x + 12] - iStart,
+        program->pixel_offset[x + 11] - iStart,
+        program->pixel_offset[x + 10] - iStart,
+        program->pixel_offset[x + 9] - iStart,
+        program->pixel_offset[x + 8] - iStart,
+        program->pixel_offset[x + 7] - iStart,
+        program->pixel_offset[x + 6] - iStart,
+        program->pixel_offset[x + 5] - iStart,
+        program->pixel_offset[x + 4] - iStart,
+        program->pixel_offset[x + 3] - iStart,
+        program->pixel_offset[x + 2] - iStart,
+        program->pixel_offset[x + 1] - iStart,
+        0);
+
+      __m512i one_epi32 = _mm512_set1_epi32(1);
+      __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+      __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+      __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+
+      // second gropup
+      __m512i perm_0_2 = _mm512_set_epi32(
+        program->pixel_offset[x + 31] - iStart,
+        program->pixel_offset[x + 30] - iStart,
+        program->pixel_offset[x + 29] - iStart,
+        program->pixel_offset[x + 28] - iStart,
+        program->pixel_offset[x + 27] - iStart,
+        program->pixel_offset[x + 26] - iStart,
+        program->pixel_offset[x + 25] - iStart,
+        program->pixel_offset[x + 24] - iStart,
+        program->pixel_offset[x + 23] - iStart,
+        program->pixel_offset[x + 22] - iStart,
+        program->pixel_offset[x + 21] - iStart,
+        program->pixel_offset[x + 20] - iStart,
+        program->pixel_offset[x + 19] - iStart,
+        program->pixel_offset[x + 18] - iStart,
+        program->pixel_offset[x + 17] - iStart,
+        program->pixel_offset[x + 16] - iStart);
+
+
+      __m512i perm_1_2 = _mm512_add_epi32(perm_0_2, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+      __m512i perm_2_2 = _mm512_add_epi32(perm_1_2, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+      __m512i perm_3_2 = _mm512_add_epi32(perm_2_2, one_epi32);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+      const float* src_ptr2 = src + program->pixel_offset[x + 16]; // all permute offsets relative to this start offset
+
+      for (int y = 0; y < height; y++) // single row proc
+      {
+        __m512 data_src = _mm512_loadu_ps(src_ptr);
+        __m512 data_src2 = _mm512_loadu_ps(src_ptr + 16); // not always needed for upscale also can cause end of buffer overread - need to add limitation (special end of buffer processing ?)
+
+        __m512 data_src_2 = _mm512_loadu_ps(src_ptr2);
+        __m512 data_src2_2 = _mm512_loadu_ps(src_ptr2 + 16); // not always needed for upscale also can cause end of buffer overread - need to add limitation (special end of buffer processing ?)
+
+        __m512 data_0 = _mm512_permutex2var_ps(data_src, perm_0, data_src2);
+        __m512 data_1 = _mm512_permutex2var_ps(data_src, perm_1, data_src2);
+        __m512 data_2 = _mm512_permutex2var_ps(data_src, perm_2, data_src2);
+        __m512 data_3 = _mm512_permutex2var_ps(data_src, perm_3, data_src2);
+
+        __m512 data_0_2 = _mm512_permutex2var_ps(data_src_2, perm_0_2, data_src2_2);
+        __m512 data_1_2 = _mm512_permutex2var_ps(data_src_2, perm_1_2, data_src2_2);
+        __m512 data_2_2 = _mm512_permutex2var_ps(data_src_2, perm_2_2, data_src2_2);
+        __m512 data_3_2 = _mm512_permutex2var_ps(data_src_2, perm_3_2, data_src2_2);
+
+        __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+        __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+        __m512 result0_2 = _mm512_mul_ps(data_0_2, coef_r0_2);
+        __m512 result1_2 = _mm512_mul_ps(data_2_2, coef_r2_2);
+
+        result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+        result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+        result0_2 = _mm512_fmadd_ps(data_1_2, coef_r1_2, result0_2);
+        result1_2 = _mm512_fmadd_ps(data_3_2, coef_r3_2, result1_2);
+
+
+        _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
+        _mm512_store_ps(dst_ptr + 16, _mm512_add_ps(result0_2, result1_2)); 
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      }
+
+      current_coeff += filter_size * 32;
+    } // to width_mo32
+
+    for (int x = width_mod32; x < width; x += 16)
+    {
+      // prepare coefs in transposed V-form
+      __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+      __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+      __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+      __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+      _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
+
+      // convert resampling program in H-form into permuting indexes for src transposition in V-form
+      int iStart = program->pixel_offset[x + 0];
+
+      __m512i perm_0 = _mm512_set_epi32(
+        program->pixel_offset[x + 15] - iStart,
+        program->pixel_offset[x + 14] - iStart,
+        program->pixel_offset[x + 13] - iStart,
+        program->pixel_offset[x + 12] - iStart,
+        program->pixel_offset[x + 11] - iStart,
+        program->pixel_offset[x + 10] - iStart,
+        program->pixel_offset[x + 9] - iStart,
+        program->pixel_offset[x + 8] - iStart,
+        program->pixel_offset[x + 7] - iStart,
+        program->pixel_offset[x + 6] - iStart,
+        program->pixel_offset[x + 5] - iStart,
+        program->pixel_offset[x + 4] - iStart,
+        program->pixel_offset[x + 3] - iStart,
+        program->pixel_offset[x + 2] - iStart,
+        program->pixel_offset[x + 1] - iStart,
+        0);
+
+      __m512i one_epi32 = _mm512_set1_epi32(1);
+      __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 2] - program->pixel_offset[x + 1]);
+      __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+      one_epi32 = _mm512_set1_epi32(program->pixel_offset[x + 3] - program->pixel_offset[x + 2]);
+      __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+
+      float* AVS_RESTRICT dst_ptr = dst + x;
+      const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+      for (int y = 0; y < height; y++) // single row proc
+      {
+        __m512 data_src = _mm512_loadu_ps(src_ptr);
+        __m512 data_src2 = _mm512_loadu_ps(src_ptr + 16); // not always needed for upscale also can cause end of buffer overread - need to add limitation (special end of buffer processing ?)
+
+        __m512 data_0 = _mm512_permutex2var_ps(data_src, perm_0, data_src2);
+        __m512 data_1 = _mm512_permutex2var_ps(data_src, perm_1, data_src2);
+        __m512 data_2 = _mm512_permutex2var_ps(data_src, perm_2, data_src2);
+        __m512 data_3 = _mm512_permutex2var_ps(data_src, perm_3, data_src2);
+
+        __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+        __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+        result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+        result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+        _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
+
+        dst_ptr += dst_pitch;
+        src_ptr += src_pitch;
+      }
+
+      current_coeff += filter_size * 16;
+    } // to width
+  }
+}
+
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+
+#if 0 // DTL version
+// Transpose-based
+// process kernel size from up to 4 - BilinearResize, BicubicResize or sinc up to taps=2
+void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+    int filter_size = program->filter_size;
+
+    const float* AVS_RESTRICT current_coeff;
+
+    src_pitch = src_pitch / sizeof(float);
+    dst_pitch = dst_pitch / sizeof(float);
+
+    float* src = (float*)src8;
+    float* dst = (float*)dst8;
+
+    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+    // this 16xfloat works, since AviSynth aligns scanlines to 64 bytes.
+    for (int x = 0; x < width; x += 16) // is it safe to read by 16 floats = 64 bytes ?
+    {
+        __m512 c1_c5_c9_c13 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+        __m512 c2_c6_c10_c14 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+        __m512 c3_c7_c11_c15 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+        __m512 c4_c8_c12_c16 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+        _MM_TRANSPOSE16_LANE4_PS(c1_c5_c9_c13, c2_c6_c10_c14, c3_c7_c11_c15, c4_c8_c12_c16);
+
+        float* AVS_RESTRICT dst_ptr = dst + x;
+        const float* src_ptr = src;
+
+        for (int y = 0; y < height; y++)
+        {
+            __m512 d1_d5_d9_d13 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 0], src_ptr + program->pixel_offset[x + 4], src_ptr + program->pixel_offset[x + 8], src_ptr + program->pixel_offset[x + 12]);
+            __m512 d2_d6_d10_d14 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 1], src_ptr + program->pixel_offset[x + 5], src_ptr + program->pixel_offset[x + 9], src_ptr + program->pixel_offset[x + 13]);
+            __m512 d3_d7_d11_d15 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 2], src_ptr + program->pixel_offset[x + 6], src_ptr + program->pixel_offset[x + 10], src_ptr + program->pixel_offset[x + 14]);
+            __m512 d4_d8_d12_d16 = _mm512_loadu_4_m128(src_ptr + program->pixel_offset[x + 3], src_ptr + program->pixel_offset[x + 7], src_ptr + program->pixel_offset[x + 11], src_ptr + program->pixel_offset[x + 15]);
+
+            _MM_TRANSPOSE16_LANE4_PS(d1_d5_d9_d13, d2_d6_d10_d14, d3_d7_d11_d15, d4_d8_d12_d16);
+
+            __m512 result = _mm512_mul_ps(d1_d5_d9_d13, c1_c5_c9_c13);
+            result = _mm512_fmadd_ps(d2_d6_d10_d14, c2_c6_c10_c14, result);
+            result = _mm512_fmadd_ps(d3_d7_d11_d15, c3_c7_c11_c15, result);
+            result = _mm512_fmadd_ps(d4_d8_d12_d16, c4_c8_c12_c16, result);
+
+            _mm512_store_ps(dst_ptr, result); 
+
+            dst_ptr += dst_pitch;
+            src_ptr += src_pitch;
+        }
+        current_coeff += filter_size * 16;
+    }
+
+}
+#endif
+
+#if 0
+void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+
+  // assert - check if max pixel_offset is not above single load of 16 src floats (or need several loads and more complex permute program)
+
+#ifdef _DEBUG
+  for (int x = 0; x < width; x += 16)
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 15];
+    assert((end_off - start_off) > 15);
+  }
+#endif
+
+  int filter_size = program->filter_size;
+
+  const float* AVS_RESTRICT current_coeff;
+  __m512i one_epi32 = _mm512_set1_epi32(1);
+
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 16)
+  {
+    // prepare coefs in transposed V-form
+    __m512 coef_r0 = _mm512_load_4_m128(current_coeff + filter_size * 0, current_coeff + filter_size * 4, current_coeff + filter_size * 8, current_coeff + filter_size * 12);
+    __m512 coef_r1 = _mm512_load_4_m128(current_coeff + filter_size * 1, current_coeff + filter_size * 5, current_coeff + filter_size * 9, current_coeff + filter_size * 13);
+    __m512 coef_r2 = _mm512_load_4_m128(current_coeff + filter_size * 2, current_coeff + filter_size * 6, current_coeff + filter_size * 10, current_coeff + filter_size * 14);
+    __m512 coef_r3 = _mm512_load_4_m128(current_coeff + filter_size * 3, current_coeff + filter_size * 7, current_coeff + filter_size * 11, current_coeff + filter_size * 15);
+
+    _MM_TRANSPOSE16_LANE4_PS(coef_r0, coef_r1, coef_r2, coef_r3);
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+    __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \
+      program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
+    __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+    __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+    __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+#if 0
+    for (int y = 0; y < height; y++) // single row proc
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+#endif
+
+    const int height_mod2 = (height / 2) * 2; // Process pairs of rows for better efficiency
+    // dual-rows not worst in performance - may be left for the future better memory performance and compute performance hosts
+    for (int y = 0; y < height_mod2; y+=2)
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+      __m512 data_src_2 = _mm512_loadu_ps(src_ptr + src_pitch);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+
+      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0, data_src_2);
+      __m512 data_1_2 = _mm512_permutexvar_ps(perm_1, data_src_2);
+      __m512 data_2_2 = _mm512_permutexvar_ps(perm_2, data_src_2);
+      __m512 data_3_2 = _mm512_permutexvar_ps(perm_3, data_src_2);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_0_2, coef_r0);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_1_2, coef_r1, result1);
+
+      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
+      result1 = _mm512_fmadd_ps(data_2_2, coef_r2, result1);
+
+      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
+      result1 = _mm512_fmadd_ps(data_3_2, coef_r3, result1);
+
+      _mm512_store_ps(dst_ptr, result0); 
+      _mm512_store_ps(dst_ptr + dst_pitch, result1);
+
+      dst_ptr += dst_pitch * 2;
+      src_ptr += src_pitch * 2;
+    }
+
+    if (height > height_mod2) // last row
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_2, coef_r2);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_3, coef_r3, result1);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
+    }
+
+    current_coeff += filter_size * 16;
+  }
+}
+#endif
+void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  // assert - check if max pixel_offset is not above single load of 16 src floats (or need several loads and more complex permute program)
+
+#ifdef _DEBUG
+  for (int x = 0; x < width; x += 16)
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 15];
+    assert((end_off - start_off) > 15);
+  }
+#endif
+
+  int filter_size = program->filter_size;
+
+  const float* AVS_RESTRICT current_coeff;
+  __m512i one_epi32 = _mm512_set1_epi32(1);
+
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 16)
+  {
+    // prepare coefs in transposed V-form, use gathering - not very slow until TRANSPOSE8_ is designed
+
+    __m512i offsets = _mm512_set_epi32(filter_size * 7, filter_size * 6, filter_size * 5, filter_size * 4, filter_size * 3, filter_size * 2, filter_size * 1, filter_size * 0, \
+                                       filter_size * 7, filter_size * 6, filter_size * 5, filter_size * 4, filter_size * 3, filter_size * 2, filter_size * 1, filter_size * 0 );
+
+    __m512 coef_r0 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r1 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r2 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r3 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r4 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r5 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r6 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r7 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+    __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \
+      program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
+    __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+    __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+    __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+    __m512i perm_4 = _mm512_add_epi32(perm_3, one_epi32);
+    __m512i perm_5 = _mm512_add_epi32(perm_4, one_epi32);
+    __m512i perm_6 = _mm512_add_epi32(perm_5, one_epi32);
+    __m512i perm_7 = _mm512_add_epi32(perm_6, one_epi32);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+#if 0
+    for (int y = 0; y < height; y++) // single row proc
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+      __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src);
+      __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src);
+      __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src);
+      __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_4, coef_r4);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_5, coef_r5, result1);
+
+      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
+      result1 = _mm512_fmadd_ps(data_6, coef_r6, result1);
+
+      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
+      result1 = _mm512_fmadd_ps(data_7, coef_r7, result1);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+#endif
+
+    const int height_mod2 = (height / 2) * 2; // Process pairs of rows for better efficiency
+    // dual-rows not worst in performance - may be left for the future better memory performance and compute performance hosts
+    for (int y = 0; y < height_mod2; y += 2)
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+      __m512 data_src_2 = _mm512_loadu_ps(src_ptr + src_pitch);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+      __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src);
+      __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src);
+      __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src);
+      __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src);
+
+      __m512 data_0_2 = _mm512_permutexvar_ps(perm_0, data_src_2);
+      __m512 data_1_2 = _mm512_permutexvar_ps(perm_1, data_src_2);
+      __m512 data_2_2 = _mm512_permutexvar_ps(perm_2, data_src_2);
+      __m512 data_3_2 = _mm512_permutexvar_ps(perm_3, data_src_2);
+      __m512 data_4_2 = _mm512_permutexvar_ps(perm_4, data_src_2);
+      __m512 data_5_2 = _mm512_permutexvar_ps(perm_5, data_src_2);
+      __m512 data_6_2 = _mm512_permutexvar_ps(perm_6, data_src_2);
+      __m512 data_7_2 = _mm512_permutexvar_ps(perm_7, data_src_2);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_0_2, coef_r0);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_1_2, coef_r1, result1);
+
+      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
+      result1 = _mm512_fmadd_ps(data_2_2, coef_r2, result1);
+
+      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
+      result1 = _mm512_fmadd_ps(data_3_2, coef_r3, result1);
+
+      result0 = _mm512_fmadd_ps(data_4, coef_r4, result0);
+      result1 = _mm512_fmadd_ps(data_4_2, coef_r4, result1);
+
+      result0 = _mm512_fmadd_ps(data_5, coef_r5, result0);
+      result1 = _mm512_fmadd_ps(data_5_2, coef_r5, result1);
+
+      result0 = _mm512_fmadd_ps(data_6, coef_r6, result0);
+      result1 = _mm512_fmadd_ps(data_6_2, coef_r6, result1);
+
+      result0 = _mm512_fmadd_ps(data_7, coef_r7, result0);
+      result1 = _mm512_fmadd_ps(data_7_2, coef_r7, result1);
+
+      _mm512_store_ps(dst_ptr, result0); 
+      _mm512_store_ps(dst_ptr + dst_pitch, result1);
+
+      dst_ptr += dst_pitch * 2;
+      src_ptr += src_pitch * 2;
+    }
+
+    if (height > height_mod2) // last row
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+      __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src);
+      __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src);
+      __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src);
+      __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_4, coef_r4);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_5, coef_r5, result1);
+
+      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
+      result1 = _mm512_fmadd_ps(data_6, coef_r6, result1);
+
+      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
+      result1 = _mm512_fmadd_ps(data_7, coef_r7, result1);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
+    }
+
+    current_coeff += filter_size * 16;
+  }
+}
+
+#if 0
+void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel)
+{
+  // assert - check if max pixel_offset is not above single load of 16 src floats (or need several loads and more complex permute program)
+#ifdef _DEBUG
+  for (int x = 0; x < width; x += 16)
+  {
+    int start_off = program->pixel_offset[x + 0];
+    int end_off = program->pixel_offset[x + 15];
+    assert((end_off - start_off) > 15);
+  }
+#endif
+
+  int filter_size = program->filter_size;
+
+  const float* AVS_RESTRICT current_coeff;
+  __m512i one_epi32 = _mm512_set1_epi32(1);
+
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  for (int x = 0; x < width; x += 16)
+  {
+    // prepare coefs in transposed V-form, use gathering - not very slow until TRANSPOSE8_ is designed
+
+    __m512i offsets = _mm512_set_epi32(filter_size * 15, filter_size * 14, filter_size * 13, filter_size * 12, filter_size * 11, filter_size * 10, filter_size * 9, filter_size * 8, \
+      filter_size * 7, filter_size * 6, filter_size * 5, filter_size * 4, filter_size * 3, filter_size * 2, filter_size * 1, filter_size * 0);
+
+    __m512 coef_r0 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r1 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r2 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r3 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r4 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r5 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r6 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r7 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r8 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r9 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r10 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r11 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r12 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r13 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r14 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+    offsets = _mm512_add_epi32(offsets, one_epi32);
+    __m512 coef_r15 = _mm512_i32gather_ps(offsets, current_coeff, 4);
+
+
+    // convert resampling program in H-form into permuting indexes for src transposition in V-form
+    int iStart = program->pixel_offset[x + 0];
+    __m512i perm_0 = _mm512_set_epi32(program->pixel_offset[x + 15] - iStart, program->pixel_offset[x + 14] - iStart, program->pixel_offset[x + 13] - iStart, program->pixel_offset[x + 12] - iStart, program->pixel_offset[x + 11] - iStart, program->pixel_offset[x + 10] - iStart, program->pixel_offset[x + 9] - iStart, program->pixel_offset[x + 8] - iStart, \
+      program->pixel_offset[x + 7] - iStart, program->pixel_offset[x + 6] - iStart, program->pixel_offset[x + 5] - iStart, program->pixel_offset[x + 4] - iStart, program->pixel_offset[x + 3] - iStart, program->pixel_offset[x + 2] - iStart, program->pixel_offset[x + 1] - iStart, 0);
+    __m512i perm_1 = _mm512_add_epi32(perm_0, one_epi32);
+    __m512i perm_2 = _mm512_add_epi32(perm_1, one_epi32);
+    __m512i perm_3 = _mm512_add_epi32(perm_2, one_epi32);
+    __m512i perm_4 = _mm512_add_epi32(perm_3, one_epi32);
+    __m512i perm_5 = _mm512_add_epi32(perm_4, one_epi32);
+    __m512i perm_6 = _mm512_add_epi32(perm_5, one_epi32);
+    __m512i perm_7 = _mm512_add_epi32(perm_6, one_epi32);
+    __m512i perm_8 = _mm512_add_epi32(perm_7, one_epi32);
+    __m512i perm_9 = _mm512_add_epi32(perm_8, one_epi32);
+    __m512i perm_10 = _mm512_add_epi32(perm_9, one_epi32);
+    __m512i perm_11 = _mm512_add_epi32(perm_10, one_epi32);
+    __m512i perm_12 = _mm512_add_epi32(perm_11, one_epi32);
+    __m512i perm_13 = _mm512_add_epi32(perm_12, one_epi32);
+    __m512i perm_14 = _mm512_add_epi32(perm_13, one_epi32);
+    __m512i perm_15 = _mm512_add_epi32(perm_14, one_epi32); // to do : test if better to add one_epi32 in the loop and only store perm_0 complex to fill dataword
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src + program->pixel_offset[x + 0]; // all permute offsets relative to this start offset
+
+    for (int y = 0; y < height; y++) // single row proc
+    {
+      __m512 data_src = _mm512_loadu_ps(src_ptr);
+
+      __m512 data_0 = _mm512_permutexvar_ps(perm_0, data_src);
+      __m512 data_1 = _mm512_permutexvar_ps(perm_1, data_src);
+      __m512 data_2 = _mm512_permutexvar_ps(perm_2, data_src);
+      __m512 data_3 = _mm512_permutexvar_ps(perm_3, data_src);
+      __m512 data_4 = _mm512_permutexvar_ps(perm_4, data_src);
+      __m512 data_5 = _mm512_permutexvar_ps(perm_5, data_src);
+      __m512 data_6 = _mm512_permutexvar_ps(perm_6, data_src);
+      __m512 data_7 = _mm512_permutexvar_ps(perm_7, data_src);
+      __m512 data_8 = _mm512_permutexvar_ps(perm_8, data_src);
+      __m512 data_9 = _mm512_permutexvar_ps(perm_9, data_src);
+      __m512 data_10 = _mm512_permutexvar_ps(perm_10, data_src);
+      __m512 data_11 = _mm512_permutexvar_ps(perm_11, data_src);
+      __m512 data_12 = _mm512_permutexvar_ps(perm_12, data_src);
+      __m512 data_13 = _mm512_permutexvar_ps(perm_13, data_src);
+      __m512 data_14 = _mm512_permutexvar_ps(perm_14, data_src);
+      __m512 data_15 = _mm512_permutexvar_ps(perm_15, data_src);
+
+      __m512 result0 = _mm512_mul_ps(data_0, coef_r0);
+      __m512 result1 = _mm512_mul_ps(data_8, coef_r8);
+
+      result0 = _mm512_fmadd_ps(data_1, coef_r1, result0);
+      result1 = _mm512_fmadd_ps(data_9, coef_r9, result1);
+
+      result0 = _mm512_fmadd_ps(data_2, coef_r2, result0);
+      result1 = _mm512_fmadd_ps(data_10, coef_r10, result1);
+
+      result0 = _mm512_fmadd_ps(data_3, coef_r3, result0);
+      result1 = _mm512_fmadd_ps(data_11, coef_r11, result1);
+
+      result0 = _mm512_fmadd_ps(data_4, coef_r4, result0);
+      result1 = _mm512_fmadd_ps(data_12, coef_r12, result1);
+
+      result0 = _mm512_fmadd_ps(data_5, coef_r5, result0);
+      result1 = _mm512_fmadd_ps(data_13, coef_r13, result1);
+
+      result0 = _mm512_fmadd_ps(data_6, coef_r6, result0);
+      result1 = _mm512_fmadd_ps(data_14, coef_r14, result1);
+
+      result0 = _mm512_fmadd_ps(data_7, coef_r7, result0);
+      result1 = _mm512_fmadd_ps(data_15, coef_r15, result1);
+
+      _mm512_store_ps(dst_ptr, _mm512_add_ps(result0, result1)); 
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    }
+
+    current_coeff += filter_size * 16;
+  }
+}
+#endif
+
+
+
+//-------- 512 bit float Verticals
+
+void resize_v_avx512_planar_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
+{
+  AVS_UNUSED(bits_per_pixel);
+
+  const int filter_size = program->filter_size;
+  const float* AVS_RESTRICT current_coeff = program->pixel_coefficient_float;
+
+  const float* src = (const float*)src8;
+  float* AVS_RESTRICT dst = (float*)dst8;
+  dst_pitch = dst_pitch / sizeof(float);
+  src_pitch = src_pitch / sizeof(float);
+
+  const int kernel_size = program->filter_size_real; // not the aligned
+  const int kernel_size_mod2 = (kernel_size / 2) * 2; // Process pairs of rows for better efficiency
+  const bool notMod2 = kernel_size_mod2 < kernel_size;
+
+  for (int y = 0; y < target_height; y++) {
+    int offset = program->pixel_offset[y];
+    const float* src_ptr = src + offset * src_pitch;
+
+    // 64 byte 16 floats (AVX512 register holds 16 floats)
+    // no need for wmod8, alignment is safe 32 bytes at least - is it safe for 64 bytes ?
+    for (int x = 0; x < width; x += 16) {
+      __m512 result_single = _mm512_setzero_ps();
+      __m512 result_single_2 = _mm512_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      // Process pairs of rows for better efficiency (2 coeffs/cycle)
+      // two result variables for potential parallel operation
+      int i = 0;
+      for (; i < kernel_size_mod2; i += 2) {
+        __m512 coeff_even = _mm512_set1_ps(current_coeff[i]);
+        __m512 coeff_odd = _mm512_set1_ps(current_coeff[i + 1]);
+
+        __m512 src_even = _mm512_loadu_ps(src2_ptr);
+        __m512 src_odd = _mm512_loadu_ps(src2_ptr + src_pitch);
+
+        result_single = _mm512_fmadd_ps(src_even, coeff_even, result_single);
+        result_single_2 = _mm512_fmadd_ps(src_odd, coeff_odd, result_single_2);
+
+        src2_ptr += 2 * src_pitch;
+      }
+
+      result_single = _mm512_add_ps(result_single, result_single_2);
+
+      // Process the last odd row if needed
+      if (notMod2) {
+        __m512 coeff = _mm512_set1_ps(current_coeff[i]);
+        __m512 src_val = _mm512_loadu_ps(src2_ptr);
+        result_single = _mm512_fmadd_ps(src_val, coeff, result_single);
+      }
+
+      _mm512_store_ps(dst + x, result_single);
+    }
+
+    dst += dst_pitch;
+    current_coeff += filter_size;
+  }
+}
+
+void resize_v_avx512_planar_float_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
+{
+  AVS_UNUSED(bits_per_pixel);
+
+  const int filter_size = program->filter_size;
+  const float* AVS_RESTRICT current_coeff = program->pixel_coefficient_float;
+
+  const float* src = (const float*)src8;
+  float* AVS_RESTRICT dst = (float*)dst8;
+  dst_pitch = dst_pitch / sizeof(float);
+  src_pitch = src_pitch / sizeof(float);
+
+  const int kernel_size = program->filter_size_real; // not the aligned
+  const int kernel_size_mod2 = (kernel_size / 2) * 2; // Process pairs of rows for better efficiency
+  const bool notMod2 = kernel_size_mod2 < kernel_size;
+
+  const int width_mod128 = (width / 128) * 128; // Process by 8x 512it (8 x 16 floats) to make memory read/write linear streams longer, 32x512 bit registers should be enough
+  const int width_mod64 = (width / 64) * 64; // Process by 4x 512it (4 x 16 floats) to make memory read/write linear streams longer,
+  const int width_mod32 = (width / 32) * 32; // Process by 2x 512it (2 x 16 floats) to make memory read/write linear streams longer,
+
+  for (int y = 0; y < target_height; y++) {
+    int offset = program->pixel_offset[y];
+    const float* src_ptr = src + offset * src_pitch;
+
+    for (int x = 0; x < width_mod128; x += 128) {
+      __m512 result_1 = _mm512_setzero_ps();
+      __m512 result_2 = _mm512_setzero_ps();
+      __m512 result_3 = _mm512_setzero_ps();
+      __m512 result_4 = _mm512_setzero_ps();
+      __m512 result_5 = _mm512_setzero_ps();
+      __m512 result_6 = _mm512_setzero_ps();
+      __m512 result_7 = _mm512_setzero_ps();
+      __m512 result_8 = _mm512_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      int i = 0;
+      for (; i < kernel_size; i ++) {
+        __m512 coeff = _mm512_set1_ps(current_coeff[i]);
+
+        __m512 src_1 = _mm512_load_ps(src2_ptr);
+        __m512 src_2 = _mm512_load_ps(src2_ptr + 16);
+        __m512 src_3 = _mm512_load_ps(src2_ptr + 32);
+        __m512 src_4 = _mm512_load_ps(src2_ptr + 48);
+        __m512 src_5 = _mm512_load_ps(src2_ptr + 64);
+        __m512 src_6 = _mm512_load_ps(src2_ptr + 80);
+        __m512 src_7 = _mm512_load_ps(src2_ptr + 96);
+        __m512 src_8 = _mm512_load_ps(src2_ptr + 112);
+
+        result_1 = _mm512_fmadd_ps(src_1, coeff, result_1);
+        result_2 = _mm512_fmadd_ps(src_2, coeff, result_2);
+        result_3 = _mm512_fmadd_ps(src_3, coeff, result_3);
+        result_4 = _mm512_fmadd_ps(src_4, coeff, result_4);
+        result_5 = _mm512_fmadd_ps(src_5, coeff, result_5);
+        result_6 = _mm512_fmadd_ps(src_6, coeff, result_6);
+        result_7 = _mm512_fmadd_ps(src_7, coeff, result_7);
+        result_8 = _mm512_fmadd_ps(src_8, coeff, result_8);
+
+        src2_ptr += src_pitch;
+      }
+
+      _mm512_store_ps(dst + x, result_1); 
+      _mm512_store_ps(dst + x + 16, result_2);
+      _mm512_store_ps(dst + x + 32, result_3);
+      _mm512_store_ps(dst + x + 48, result_4);
+      _mm512_store_ps(dst + x + 64, result_5);
+      _mm512_store_ps(dst + x + 80, result_6);
+      _mm512_store_ps(dst + x + 96, result_7);
+      _mm512_store_ps(dst + x + 112, result_8);
+    }
+
+    for (int x = width_mod128; x < width_mod64; x += 64) {
+      __m512 result_1 = _mm512_setzero_ps();
+      __m512 result_2 = _mm512_setzero_ps();
+      __m512 result_3 = _mm512_setzero_ps();
+      __m512 result_4 = _mm512_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      int i = 0;
+      for (; i < kernel_size; i++) {
+        __m512 coeff = _mm512_set1_ps(current_coeff[i]);
+
+        __m512 src_1 = _mm512_load_ps(src2_ptr);
+        __m512 src_2 = _mm512_load_ps(src2_ptr + 16);
+        __m512 src_3 = _mm512_load_ps(src2_ptr + 32);
+        __m512 src_4 = _mm512_load_ps(src2_ptr + 48);
+
+        result_1 = _mm512_fmadd_ps(src_1, coeff, result_1);
+        result_2 = _mm512_fmadd_ps(src_2, coeff, result_2);
+        result_3 = _mm512_fmadd_ps(src_3, coeff, result_3);
+        result_4 = _mm512_fmadd_ps(src_4, coeff, result_4);
+
+        src2_ptr += src_pitch;
+      }
+
+      _mm512_store_ps(dst + x, result_1);
+      _mm512_store_ps(dst + x + 16, result_2);
+      _mm512_store_ps(dst + x + 32, result_3);
+      _mm512_store_ps(dst + x + 48, result_4);
+    }
+
+    for (int x = width_mod64; x < width_mod32; x += 32) {
+      __m512 result_1 = _mm512_setzero_ps();
+      __m512 result_2 = _mm512_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      int i = 0;
+      for (; i < kernel_size; i++) {
+        __m512 coeff = _mm512_set1_ps(current_coeff[i]);
+
+        __m512 src_1 = _mm512_load_ps(src2_ptr);
+        __m512 src_2 = _mm512_load_ps(src2_ptr + 16);
+
+        result_1 = _mm512_fmadd_ps(src_1, coeff, result_1);
+        result_2 = _mm512_fmadd_ps(src_2, coeff, result_2);
+
+        src2_ptr += src_pitch;
+      }
+
+      _mm512_store_ps(dst + x, result_1);
+      _mm512_store_ps(dst + x + 16, result_2);
+    }
+
+
+    // 64 byte 16 floats (AVX512 register holds 16 floats)
+    // row alignment is 64 bytes - so it is safe to load mod16 of float32 ?
+    for (int x = width_mod32; x < width; x += 16) {
+      __m512 result_single = _mm512_setzero_ps();
+      __m512 result_single_2 = _mm512_setzero_ps();
+
+      const float* AVS_RESTRICT src2_ptr = src_ptr + x; // __restrict here
+
+      // Process pairs of rows for better efficiency (2 coeffs/cycle)
+      // two result variables for potential parallel operation
+      int i = 0;
+      for (; i < kernel_size_mod2; i += 2) {
+        __m512 coeff_even = _mm512_set1_ps(current_coeff[i]);
+        __m512 coeff_odd = _mm512_set1_ps(current_coeff[i + 1]);
+
+        __m512 src_even = _mm512_load_ps(src2_ptr);
+        __m512 src_odd = _mm512_load_ps(src2_ptr + src_pitch);
+
+        result_single = _mm512_fmadd_ps(src_even, coeff_even, result_single);
+        result_single_2 = _mm512_fmadd_ps(src_odd, coeff_odd, result_single_2);
+
+        src2_ptr += 2 * src_pitch;
+      }
+
+      result_single = _mm512_add_ps(result_single, result_single_2);
+
+      // Process the last odd row if needed
+      if (notMod2) {
+        __m512 coeff = _mm512_set1_ps(current_coeff[i]);
+        __m512 src_val = _mm512_loadu_ps(src2_ptr);
+        result_single = _mm512_fmadd_ps(src_val, coeff, result_single);
+      }
+
+      _mm512_store_ps(dst + x, result_single);
+    }
+
+
+    dst += dst_pitch;
+    current_coeff += filter_size;
+  }
+}
+
+// uint8_t
+void resize_v_avx512_planar_uint8_t_w_sr(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
+{
+  AVS_UNUSED(bits_per_pixel);
+  int filter_size = program->filter_size;
+  const short* AVS_RESTRICT current_coeff = program->pixel_coefficient;
+  __m512i rounder = _mm512_set1_epi32(1 << (FPScale8bits - 1));
+  __m512i zero = _mm512_setzero_si512();
+
+  const int kernel_size = program->filter_size_real; // not the aligned
+
+  const int width_mod128 = (width / 128) * 128;
+
+  const __m512i perm_idx1 = _mm512_set_epi64(8 + 5, 8 + 4, 8 + 1, 8 + 0, 5, 4, 1, 0);
+  const __m512i perm_idx2 = _mm512_set_epi64(8 + 7, 8 + 6, 8 + 3, 8 + 2, 7, 6, 3, 2);
+
+  for (int y = 0; y < target_height; y++) {
+    int offset = program->pixel_offset[y];
+    const BYTE* AVS_RESTRICT src_ptr = src + offset * src_pitch;
+
+    for (int x = 0; x < width_mod128; x += 128) {
+
+      __m512i result_lo = rounder;
+      __m512i result_hi = rounder;
+      __m512i result_lo2 = rounder;
+      __m512i result_hi2 = rounder;
+
+      __m512i result_lo_2 = rounder;
+      __m512i result_hi_2 = rounder;
+      __m512i result_lo2_2 = rounder;
+      __m512i result_hi2_2 = rounder;
+
+      const uint8_t* AVS_RESTRICT src2_ptr = src_ptr + x;
+
+      int i = 0;
+      // 128 byte 128 pixel
+      for (; i < kernel_size; i++) {
+        // Broadcast a single coefficients
+        __m512i coeff = _mm512_set1_epi16(*reinterpret_cast<const short*>(current_coeff + i)); // 0|co|0|co|0|co|0|co   0|co|0|co|0|co|0|co
+
+        __m512i src_1_1 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr))); // 32x 8->16bit pixels
+        __m512i src_1_2 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr + 32))); // 32x 8->16bit pixels
+        __m512i src_2_1 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr + 64))); // 32x 8->16bit pixels
+        __m512i src_2_2 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr + 96))); // 32x 8->16bit pixels
+
+        __m512i src_lo = _mm512_unpacklo_epi16(src_1_1, zero);
+        __m512i src_hi = _mm512_unpackhi_epi16(src_1_1, zero);
+        __m512i src_lo2 = _mm512_unpacklo_epi16(src_1_2, zero);
+        __m512i src_hi2 = _mm512_unpackhi_epi16(src_1_2, zero);
+
+        __m512i src_lo_2 = _mm512_unpacklo_epi16(src_2_1, zero);
+        __m512i src_hi_2 = _mm512_unpackhi_epi16(src_2_1, zero);
+        __m512i src_lo2_2 = _mm512_unpacklo_epi16(src_2_2, zero);
+        __m512i src_hi2_2 = _mm512_unpackhi_epi16(src_2_2, zero);
+
+        result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c
+        result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c
+        result_lo2 = _mm512_add_epi32(result_lo2, _mm512_madd_epi16(src_lo2, coeff)); // a*b + c
+        result_hi2 = _mm512_add_epi32(result_hi2, _mm512_madd_epi16(src_hi2, coeff)); // a*b + c
+
+        result_lo_2 = _mm512_add_epi32(result_lo_2, _mm512_madd_epi16(src_lo_2, coeff)); // a*b + c
+        result_hi_2 = _mm512_add_epi32(result_hi_2, _mm512_madd_epi16(src_hi_2, coeff)); // a*b + c
+        result_lo2_2 = _mm512_add_epi32(result_lo2_2, _mm512_madd_epi16(src_lo2_2, coeff)); // a*b + c
+        result_hi2_2 = _mm512_add_epi32(result_hi2_2, _mm512_madd_epi16(src_hi2_2, coeff)); // a*b + c
+
+        src2_ptr += src_pitch;
+
+      }
+
+      // scale back, store
+      // shift back integer arithmetic 14 bits precision
+      result_lo = _mm512_srai_epi32(result_lo, FPScale8bits);
+      result_hi = _mm512_srai_epi32(result_hi, FPScale8bits);
+      result_lo2 = _mm512_srai_epi32(result_lo2, FPScale8bits);
+      result_hi2 = _mm512_srai_epi32(result_hi2, FPScale8bits);
+
+      result_lo_2 = _mm512_srai_epi32(result_lo_2, FPScale8bits);
+      result_hi_2 = _mm512_srai_epi32(result_hi_2, FPScale8bits);
+      result_lo2_2 = _mm512_srai_epi32(result_lo2_2, FPScale8bits);
+      result_hi2_2 = _mm512_srai_epi32(result_hi2_2, FPScale8bits);
+
+      __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi);
+      __m512i result2_2x8x_uint16 = _mm512_packus_epi32(result_lo2, result_hi2);
+
+      __m512i result_2x8x_uint16_2 = _mm512_packus_epi32(result_lo_2, result_hi_2);
+      __m512i result2_2x8x_uint16_2 = _mm512_packus_epi32(result_lo2_2, result_hi2_2);
+
+      __m512i pack_1 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx1, result2_2x8x_uint16);
+      __m512i pack_2 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx2, result2_2x8x_uint16);
+
+      __m512i pack_1_2 = _mm512_permutex2var_epi64(result_2x8x_uint16_2, perm_idx1, result2_2x8x_uint16_2);
+      __m512i pack_2_2 = _mm512_permutex2var_epi64(result_2x8x_uint16_2, perm_idx2, result2_2x8x_uint16_2);
+
+      __m512i res = _mm512_packus_epi16(pack_1, pack_2);
+      __m512i res_2 = _mm512_packus_epi16(pack_1_2, pack_2_2);
+
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), res);
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x + 64), res_2);
+
+    }
+
+    // 64 byte 64 pixel
+    // no need wmod16, alignment is safe at least 32
+    for (int x = width_mod128; x < width; x += 64) {
+
+      __m512i result_lo = rounder;
+      __m512i result_hi = rounder;
+
+      __m512i result_lo2 = rounder;
+      __m512i result_hi2 = rounder;
+
+      const uint8_t* AVS_RESTRICT src2_ptr = src_ptr + x;
+
+      int i = 0;
+      for (; i < kernel_size; i++) {
+        // Broadcast a single coefficients
+        __m512i coeff = _mm512_set1_epi16(*reinterpret_cast<const short*>(current_coeff + i)); // 0|co|0|co|0|co|0|co   0|co|0|co|0|co|0|co
+
+        __m512i src_1_1 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr))); // 32x 8->16bit pixels
+        __m512i src_1_2 = _mm512_cvtepu8_epi16(_mm256_load_si256(reinterpret_cast<const __m256i*>(src2_ptr + 32))); // 32x 8->16bit pixels
+
+        __m512i src_lo = _mm512_unpacklo_epi16(src_1_1, zero);
+        __m512i src_hi = _mm512_unpackhi_epi16(src_1_1, zero);
+
+        __m512i src_lo2 = _mm512_unpacklo_epi16(src_1_2, zero);
+        __m512i src_hi2 = _mm512_unpackhi_epi16(src_1_2, zero);
+
+        result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c
+        result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c
+
+        result_lo2 = _mm512_add_epi32(result_lo2, _mm512_madd_epi16(src_lo2, coeff)); // a*b + c
+        result_hi2 = _mm512_add_epi32(result_hi2, _mm512_madd_epi16(src_hi2, coeff)); // a*b + c
+
+        src2_ptr += src_pitch;
+
+      }
+
+      // scale back, store
+      // shift back integer arithmetic 14 bits precision
+      result_lo = _mm512_srai_epi32(result_lo, FPScale8bits);
+      result_hi = _mm512_srai_epi32(result_hi, FPScale8bits);
+
+      result_lo2 = _mm512_srai_epi32(result_lo2, FPScale8bits);
+      result_hi2 = _mm512_srai_epi32(result_hi2, FPScale8bits);
+
+      __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi);
+      __m512i result_2x8x_uint16_2 = _mm512_packus_epi32(result_lo2, result_hi2);
+
+      __m512i pack_1 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx1, result_2x8x_uint16_2);
+      __m512i pack_2 = _mm512_permutex2var_epi64(result_2x8x_uint16, perm_idx2, result_2x8x_uint16_2);
+
+      __m512i res = _mm512_packus_epi16(pack_1, pack_2);
+
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), res);
+
+    }
+
+    dst += dst_pitch;
+    current_coeff += filter_size;
+  }
+}
+
+//uint16_t
+template<bool lessthan16bit>
+void resize_v_avx512_planar_uint16_t_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
+{
+  int filter_size = program->filter_size;
+  const short* AVS_RESTRICT current_coeff = program->pixel_coefficient;
+
+  const __m512i zero = _mm512_setzero_si512();
+
+  const int width_mod64 = (width / 64) * 64;
+
+  // for 16 bits only
+  const __m512i shifttosigned = _mm512_set1_epi16(-32768);
+  const __m512i shiftfromsigned = _mm512_set1_epi32(32768 << FPScale16bits);
+
+  const __m512i rounder = _mm512_set1_epi32(1 << (FPScale16bits - 1));
+
+  const uint16_t* src = (uint16_t*)src8;
+  uint16_t* AVS_RESTRICT dst = (uint16_t * AVS_RESTRICT)dst8;
+  dst_pitch = dst_pitch / sizeof(uint16_t);
+  src_pitch = src_pitch / sizeof(uint16_t);
+
+  const int kernel_size = program->filter_size_real; // not the aligned
+
+  const int limit = (1 << bits_per_pixel) - 1;
+  __m512i clamp_limit = _mm512_set1_epi16((short)limit); // clamp limit for <16 bits
+
+  for (int y = 0; y < target_height; y++) {
+    int offset = program->pixel_offset[y];
+    const uint16_t* src_ptr = src + offset * src_pitch;
+
+    // 128 byte 32 word
+    for (int x = 0; x < width_mod64; x += 64) {
+
+      __m512i result_lo = rounder;
+      __m512i result_hi = rounder;
+
+      __m512i result_lo_2 = rounder;
+      __m512i result_hi_2 = rounder;
+
+      const uint16_t* AVS_RESTRICT src2_ptr = src_ptr + x;
+
+      int i = 0;
+      for (; i < kernel_size; i++) {
+        // Broadcast a single coefficients
+        __m512i coeff = _mm512_set1_epi16(current_coeff[i]); // 0|co|0|co|0|co|0|co   0|co|0|co|0|co|0|co
+
+        __m512i src = _mm512_load_si512(reinterpret_cast<const __m512i*>(src2_ptr)); // 32x 16bit pixels
+        __m512i src_2 = _mm512_load_si512(reinterpret_cast<const __m512i*>(src2_ptr + 32)); // 32x 16bit pixels
+
+        if (!lessthan16bit) {
+          src = _mm512_add_epi16(src, shifttosigned);
+          src_2 = _mm512_add_epi16(src_2, shifttosigned);
+        }
+
+        __m512i src_lo = _mm512_unpacklo_epi16(src, zero);
+        __m512i src_hi = _mm512_unpackhi_epi16(src, zero);
+
+        __m512i src_lo_2 = _mm512_unpacklo_epi16(src_2, zero);
+        __m512i src_hi_2 = _mm512_unpackhi_epi16(src_2, zero);
+
+        result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c
+        result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c
+
+        result_lo_2 = _mm512_add_epi32(result_lo_2, _mm512_madd_epi16(src_lo_2, coeff)); // a*b + c
+        result_hi_2 = _mm512_add_epi32(result_hi_2, _mm512_madd_epi16(src_hi_2, coeff)); // a*b + c
+
+        src2_ptr += src_pitch;
+      }
+
+      if (!lessthan16bit) {
+        result_lo = _mm512_add_epi32(result_lo, shiftfromsigned);
+        result_hi = _mm512_add_epi32(result_hi, shiftfromsigned);
+
+        result_lo_2 = _mm512_add_epi32(result_lo_2, shiftfromsigned);
+        result_hi_2 = _mm512_add_epi32(result_hi_2, shiftfromsigned);
+
+      }
+      // shift back integer arithmetic 13 bits precision
+      result_lo = _mm512_srai_epi32(result_lo, FPScale16bits);
+      result_hi = _mm512_srai_epi32(result_hi, FPScale16bits);
+
+      result_lo_2 = _mm512_srai_epi32(result_lo_2, FPScale16bits);
+      result_hi_2 = _mm512_srai_epi32(result_hi_2, FPScale16bits);
+
+      __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi);
+      __m512i result_2x8x_uint16_2 = _mm512_packus_epi32(result_lo_2, result_hi_2);
+      if (lessthan16bit) {
+        result_2x8x_uint16 = _mm512_min_epu16(result_2x8x_uint16, clamp_limit); // extra clamp for 10-14 bit
+        result_2x8x_uint16_2 = _mm512_min_epu16(result_2x8x_uint16_2, clamp_limit); // extra clamp for 10-14 bit
+      }
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), result_2x8x_uint16);
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x + 32), result_2x8x_uint16_2);
+    }
+    
+    // last 32
+    // 64 byte 32 word
+    for (int x = width_mod64; x < width; x += 32) { 
+
+      __m512i result_lo = rounder;
+      __m512i result_hi = rounder;
+
+      const uint16_t* AVS_RESTRICT src2_ptr = src_ptr + x;
+
+      int i = 0;
+      for (; i < kernel_size; i++) {
+        // Broadcast a single coefficients
+        __m512i coeff = _mm512_set1_epi16(current_coeff[i]); // 0|co|0|co|0|co|0|co   0|co|0|co|0|co|0|co
+
+        __m512i src = _mm512_load_si512(reinterpret_cast<const __m512i*>(src2_ptr)); // 32x 16bit pixels
+        if (!lessthan16bit) {
+          src = _mm512_add_epi16(src, shifttosigned);
+        }
+        __m512i src_lo = _mm512_unpacklo_epi16(src, zero);
+        __m512i src_hi = _mm512_unpackhi_epi16(src, zero);
+        result_lo = _mm512_add_epi32(result_lo, _mm512_madd_epi16(src_lo, coeff)); // a*b + c
+        result_hi = _mm512_add_epi32(result_hi, _mm512_madd_epi16(src_hi, coeff)); // a*b + c
+
+        src2_ptr += src_pitch;
+      }
+
+      if (!lessthan16bit) {
+        result_lo = _mm512_add_epi32(result_lo, shiftfromsigned);
+        result_hi = _mm512_add_epi32(result_hi, shiftfromsigned);
+      }
+      // shift back integer arithmetic 13 bits precision
+      result_lo = _mm512_srai_epi32(result_lo, FPScale16bits);
+      result_hi = _mm512_srai_epi32(result_hi, FPScale16bits);
+
+      __m512i result_2x8x_uint16 = _mm512_packus_epi32(result_lo, result_hi);
+      if (lessthan16bit) {
+        result_2x8x_uint16 = _mm512_min_epu16(result_2x8x_uint16, clamp_limit); // extra clamp for 10-14 bit
+      }
+      _mm512_store_si512(reinterpret_cast<__m512i*>(dst + x), result_2x8x_uint16);
+
+    }
+
+    dst += dst_pitch;
+    current_coeff += filter_size;
+  }
+}
+
+// avx512 16
+template void resize_v_avx512_planar_uint16_t_w_sr<false>(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+// avx512 10-14bit
+template void resize_v_avx512_planar_uint16_t_w_sr<true>(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
+
+
+
diff --git a/avs_core/filters/intel/resample_avx512.h b/avs_core/filters/intel/resample_avx512.h
new file mode 100644
index 000000000..a9b5a0706
--- /dev/null
+++ b/avs_core/filters/intel/resample_avx512.h
@@ -0,0 +1,147 @@
+// Avisynth v2.5.  Copyright 2002 Ben Rudiak-Gould et al.
+// http://avisynth.nl
+
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
+// http://www.gnu.org/copyleft/gpl.html .
+//
+// Linking Avisynth statically or dynamically with other modules is making a
+// combined work based on Avisynth.  Thus, the terms and conditions of the GNU
+// General Public License cover the whole combination.
+//
+// As a special exception, the copyright holders of Avisynth give you
+// permission to link Avisynth with independent modules that communicate with
+// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license
+// terms of these independent modules, and to copy and distribute the
+// resulting combined work under terms of your choice, provided that
+// every copy of the combined work is accompanied by a complete copy of
+// the source code of Avisynth (the version of Avisynth used to produce the
+// combined work), being distributed under the terms of the GNU General
+// Public License plus this exception.  An independent module is a module
+// which is not derived from or based on Avisynth, such as 3rd-party filters,
+// import and export plugins, or graphical user interfaces.
+
+#ifndef __Resample_AVX512_H__
+#define __Resample_AVX512_H__
+
+#include <avisynth.h>
+#include "../resample_functions.h"
+
+#include <immintrin.h> // includes AVX, AVX2, FMA3, AVX512F, AVX512BW, etc. for MSVC, Clang, and GCC
+
+// compiler feature checks and error handling
+#if defined(__clang__) && !defined(_MSC_VER)
+#if !defined(__AVX512F__) || !defined(__AVX512BW__)
+#error "This code requires a compiler that supports AVX-512F and AVX-512BW.  Use compiler flags -mavx512f -mavx512bw."
+#endif
+#elif defined(__GNUC__)
+#if !defined(__AVX512F__) || !defined(__AVX512BW__)
+#error "This code requires a compiler that supports AVX-512F and AVX-512BW.  Use compiler flags -mavx512f -mavx512bw."
+#endif
+#elif defined(_MSC_VER)
+  #if !defined(_M_X64) && !defined(_M_AMD64) && !defined(_M_ARM64)
+  #error "AVX-512 is only supported on x64 and ARM64 architectures."
+  #endif
+  // MSVC's <immintrin.h> provides AVX-512 support when /arch:AVX512 is used.
+  // However, MSVC may not define __AVX512F__ or __AVX512BW__ consistently.
+  // We rely on /arch:AVX512 having been set, and assume that if the user is
+  // including this header, they intend to use AVX-512.
+#else
+  #error "Unsupported compiler. This code requires a compiler that supports AVX-512F and AVX-512BW (GCC, Clang, or MSVC)."
+#endif
+
+#if !defined(__FMA__)
+// Assume that all processors that have AVX2/AVX512 also have FMA3
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (__clang__)
+// Prevent error message in g++ when using FMA intrinsics with avx2:
+#pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
+#else
+#define __FMA__  1
+#endif
+#endif
+// FMA3 instruction set
+#if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__))  && ! defined (__INTEL_COMPILER)
+#include <fmaintrin.h>
+#endif // __FMA__
+
+// MSVC Missing Intrinsics (Workaround for older MSVC versions)
+#if defined(_MSC_VER) && !defined(__clang__)
+#if _MSC_VER < 1922 // Check for MSVC version less than 16.2 (VS 2019 16.2)
+  // Define missing AVX-512BW mask intrinsics for older MSVC.
+  // inline functions that perform the mask operations directly.
+  // Since this is MSVC only, using specific __forceinline.
+__forceinline __mmask64 _kand_mask64(__mmask64 a, __mmask64 b) { return a & b; }
+__forceinline __mmask64 _kor_mask64(__mmask64 a, __mmask64 b) { return a | b; }
+__forceinline __mmask32 _kand_mask32(__mmask32 a, __mmask32 b) { return a & b; }
+__forceinline __mmask32 _kor_mask32(__mmask32 a, __mmask32 b) { return a | b; }
+#endif
+#endif
+
+// useful macros
+
+#define _MM_TRANSPOSE16_LANE4_PS(row0, row1, row2, row3) \
+  do { \
+    __m512 __t0, __t1, __t2, __t3; \
+    __t0 = _mm512_unpacklo_ps(row0, row1); \
+    __t1 = _mm512_unpackhi_ps(row0, row1); \
+    __t2 = _mm512_unpacklo_ps(row2, row3); \
+    __t3 = _mm512_unpackhi_ps(row2, row3); \
+    row0 = _mm512_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \
+    row1 = _mm512_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \
+    row2 = _mm512_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \
+    row3 = _mm512_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \
+  } while (0)
+
+#ifndef _mm512_loadu_4_m128
+#define _mm512_loadu_4_m128(/* __m128 const* */ addr1, \
+                            /* __m128 const* */ addr2, \
+                            /* __m128 const* */ addr3, \
+                            /* __m128 const* */ addr4) \
+_mm512_insertf32x4(_mm512_insertf32x4(_mm512_insertf32x4(_mm512_castps128_ps512(_mm_loadu_ps(addr1)), _mm_loadu_ps(addr2), 1), _mm_loadu_ps(addr3), 2), _mm_loadu_ps(addr4), 3)
+#endif
+
+#ifndef _mm512_load_4_m128
+#define _mm512_load_4_m128(/* __m128 const* */ addr1, \
+                            /* __m128 const* */ addr2, \
+                            /* __m128 const* */ addr3, \
+                            /* __m128 const* */ addr4) \
+_mm512_insertf32x4(_mm512_insertf32x4(_mm512_insertf32x4(_mm512_castps128_ps512(_mm_load_ps(addr1)), _mm_load_ps(addr2), 1), _mm_load_ps(addr3), 2), _mm_load_ps(addr4), 3)
+#endif
+
+
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+template<int filtersizemod4>
+void resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+
+void resize_h_planar_float_avx512_permutex_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+void resize_h_planar_float_avx512_permutex_vstripe_ks8(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+void resize_h_planar_float_avx512_permutex_vstripe_ks16(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+void resize_v_avx512_planar_float(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+void resize_v_avx512_planar_float_w_sr(BYTE* dst0, const BYTE* src0, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
+// uint8_t
+void resize_v_avx512_planar_uint8_t_w_sr(BYTE* AVS_RESTRICT dst, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
+// uint16_t
+template<bool lessthan16bit>
+void resize_v_avx512_planar_uint16_t_w_sr(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
+
+#endif // __Resample_AVX512_H__
diff --git a/avs_core/filters/intel/resample_sse.cpp b/avs_core/filters/intel/resample_sse.cpp
index 0fc6c66ca..7f0afbca3 100644
--- a/avs_core/filters/intel/resample_sse.cpp
+++ b/avs_core/filters/intel/resample_sse.cpp
@@ -152,7 +152,7 @@ void resize_v_mmx_planar(BYTE* dst, const BYTE* src, int dst_pitch, int src_pitc
 }
 #endif
 
-/*
+#if 0
 void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
 {
   AVS_UNUSED(bits_per_pixel);
@@ -226,7 +226,7 @@ void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pi
     current_coeff += filter_size;
   }
 }
-*/
+#else
 
 void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
 {
@@ -340,7 +340,7 @@ void resize_v_sse2_planar(BYTE* dst8, const BYTE* src, int dst_pitch, int src_pi
         current_coeff += filter_size;
     }
 }
-
+#endif
 // like the AVX2 version, but only 8 pixels at a time
 template<bool lessthan16bit>
 void resize_v_sse2_planar_uint16_t(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel)
@@ -597,7 +597,7 @@ void resizer_h_ssse3_generic_float(BYTE* dst8, const BYTE* src8, int dst_pitch,
   dst_pitch = dst_pitch / sizeof(float);
   src_pitch = src_pitch / sizeof(float);
 
-  const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8;
+  const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8;
 
   for (int y = 0; y < height; y++) {
     float* current_coeff_base = program->pixel_coefficient_float;
@@ -987,7 +987,7 @@ void resizer_h_ssse3_generic_uint8_16(BYTE* dst8, const BYTE* src8, int dst_pitc
   dst_pitch /= sizeof(pixel_t);
   src_pitch /= sizeof(pixel_t);
 
-  const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8;
+  const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8;
 
   for (int y = 0; y < height; y++) {
     const short* AVS_RESTRICT current_coeff_base = program->pixel_coefficient;
@@ -1018,3 +1018,289 @@ template void resizer_h_ssse3_generic_uint8_16<uint16_t, true>(BYTE* dst8, const
 template void resize_v_sse2_planar_uint16_t<false>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 template void resize_v_sse2_planar_uint16_t<true>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int target_height, int bits_per_pixel);
 
+// Transpose-based SIMD
+void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+  int filter_size = program->filter_size;
+
+  const float* AVS_RESTRICT current_coeff;
+
+  src_pitch = src_pitch / sizeof(float);
+  dst_pitch = dst_pitch / sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  const int kernel_size = program->filter_size_real;
+  const int ksmod4 = kernel_size / 4 * 4;
+  //	const int ksmod8 = kernel_size / 8 * 8;
+
+#if 0
+    // single row processing - slower
+  for (int y = 0; y < height; y++) {
+    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+    float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
+    const float* src_ptr = src + y * src_pitch;
+
+    // FIXME: the SIMD safe end is not width, but safe_width
+    for (int x = 0; x < width; x += 4) {
+
+      __m128 result = _mm_setzero_ps();
+
+      for (int i = 0; i < ksmod4; i += 4) {
+        // 4 pixels, in outer x loop. Each has different "begin" offset
+        __m128 data_1 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 0] + i);
+        __m128 data_2 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 1] + i);
+        __m128 data_3 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 2] + i);
+        __m128 data_4 = _mm_loadu_ps(src_ptr + program->pixel_offset[x + 3] + i);
+
+        __m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0);
+        __m128 coeff_2 = _mm_load_ps(current_coeff + i + filter_size * 1);
+        __m128 coeff_3 = _mm_load_ps(current_coeff + i + filter_size * 2);
+        __m128 coeff_4 = _mm_load_ps(current_coeff + i + filter_size * 3);
+
+        _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
+        _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
+
+        result = _mm_add_ps(_mm_mul_ps(data_1, coeff_1), result);
+        result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
+        result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
+        result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
+      }
+
+      _mm_store_ps(dst2_ptr + x, result);
+      current_coeff += filter_size * 4;
+    }
+  }
+#endif
+  constexpr int PIXELS_AT_A_TIME = 4;
+  // source_overread_beyond_targetx must be compatible with the number of source pixels loaded by SIMD load.
+  // loadu_ps: 4 pixels.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
+
+  // this is not good, height mod 2 must be used src_ptr2 would access beyond frame
+  for (int y = 0; y < height; y += 2) {
+    current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+    float* AVS_RESTRICT dst2_ptr = dst + y * dst_pitch;
+    float* AVS_RESTRICT dst2_ptr2 = dst + (y + 1) * dst_pitch;
+    const float* src_ptr = src + y * src_pitch;
+    const float* src_ptr2 = src + (y + 1) * src_pitch;
+
+    // 1st pass: from 0 to width_safe_mod in PIXELS_AT_A_TIME steps
+    // 2nd pass: from width_safe_mod to width in single pixel steps
+    //for (int x = 0; x < width_safe_mod; x += PIXELS_AT_A_TIME) {
+    for (int x = 0; x < width; x += PIXELS_AT_A_TIME) {
+
+      __m128 result = _mm_setzero_ps();
+      __m128 result2 = _mm_setzero_ps();
+
+      for (int i = 0; i < kernel_size; i += 4) { // it is always mod4 ?
+
+        const int begin1 = program->pixel_offset[x + 0];
+        const int begin2 = program->pixel_offset[x + 1];
+        const int begin3 = program->pixel_offset[x + 2];
+        const int begin4 = program->pixel_offset[x + 3];
+
+        // this is not good, src_ptr must be used instead of src_ptr + i
+        __m128 data_1 = _mm_loadu_ps(src_ptr + i + begin1);
+        __m128 data_2 = _mm_loadu_ps(src_ptr + i + begin2);
+        __m128 data_3 = _mm_loadu_ps(src_ptr + i + begin3);
+        __m128 data_4 = _mm_loadu_ps(src_ptr + i + begin4);
+
+        __m128 data_1_2 = _mm_loadu_ps(src_ptr2 + i + begin1);
+        __m128 data_2_2 = _mm_loadu_ps(src_ptr2 + i + begin2);
+        __m128 data_3_2 = _mm_loadu_ps(src_ptr2 + i + begin3);
+        __m128 data_4_2 = _mm_loadu_ps(src_ptr2 + i + begin4);
+
+        __m128 coeff_1 = _mm_load_ps(current_coeff + i + filter_size * 0);
+        __m128 coeff_2 = _mm_load_ps(current_coeff + i + filter_size * 1);
+        __m128 coeff_3 = _mm_load_ps(current_coeff + i + filter_size * 2);
+        __m128 coeff_4 = _mm_load_ps(current_coeff + i + filter_size * 3);
+
+        _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
+        _MM_TRANSPOSE4_PS(data_1_2, data_2_2, data_3_2, data_4_2);
+        _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
+
+        result = _mm_add_ps(_mm_mul_ps(data_1, coeff_1), result);
+        result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
+        result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
+        result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
+
+        result2 = _mm_add_ps(_mm_mul_ps(data_1_2, coeff_1), result2);
+        result2 = _mm_add_ps(_mm_mul_ps(data_2_2, coeff_2), result2);
+        result2 = _mm_add_ps(_mm_mul_ps(data_3_2, coeff_3), result2);
+        result2 = _mm_add_ps(_mm_mul_ps(data_4_2, coeff_4), result2);
+
+      }
+
+      _mm_store_ps(dst2_ptr + x, result);
+      _mm_store_ps(dst2_ptr2 + x, result2);
+
+      current_coeff += filter_size * 4;
+    }
+  }
+
+  // to do - need to process last row of not-mod2 heights
+}
+
+// Safe partial load with SSE2
+// Read exactly N pixels, avoiding
+// - reading beyond the end of the source buffer.
+// - avoid NaN contamination, since event with zero coefficients NaN * 0 = NaN
+template <int Nmod4>
+AVS_FORCEINLINE static __m128 load_partial_safe_sse2(const float* src_ptr_offsetted) {
+  switch (Nmod4) {
+  case 1:
+    return _mm_set_ps(0.0f, 0.0f, 0.0f, src_ptr_offsetted[0]);
+    // ideally: movss
+  case 2:
+    return _mm_set_ps(0.0f, 0.0f, src_ptr_offsetted[1], src_ptr_offsetted[0]);
+    // ideally: movsd
+  case 3:
+    return _mm_set_ps(0.0f, src_ptr_offsetted[2], src_ptr_offsetted[1], src_ptr_offsetted[0]);
+    // ideally: movss + movsd + shuffle or movsd + insert
+  case 0:
+    return _mm_set_ps(src_ptr_offsetted[3], src_ptr_offsetted[2], src_ptr_offsetted[1], src_ptr_offsetted[0]);
+    // ideally: movups
+  default:
+    return _mm_setzero_ps(); // n/a cannot happen
+  }
+}
+
+// Processes a horizontal resampling kernel of up to four coefficients for float pixel types.
+// Supports BilinearResize, BicubicResize, or sinc with up to 2 taps (filter size <= 4).
+// SSE2 optimization loads and processes four float coefficients and pixels simultaneously.
+// The 'filtersizemod4' template parameter (0-3) helps optimize for different filter sizes modulo 4.
+// This SSE2 requires only filter_size_alignment of 4.
+template<int filtersizemod4>
+void resize_h_planar_float_sse_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel) {
+  assert(filtersizemod4 >= 0 && filtersizemod4 <= 3);
+
+  const int filter_size = program->filter_size; // aligned, practically the coeff table stride
+
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(float);
+
+  float* src = (float*)src8;
+  float* dst = (float*)dst8;
+
+  const float* AVS_RESTRICT current_coeff = (const float* AVS_RESTRICT)program->pixel_coefficient_float;
+
+  constexpr int PIXELS_AT_A_TIME = 4; // Process four pixels in parallel using SSE2
+
+  // 'source_overread_beyond_targetx' indicates if the filter kernel can read beyond the target width.
+  // Even if the filter alignment allows larger reads, our safety boundary for unaligned loads starts at 4 pixels back
+  // from the target width, as we load 4 floats at once with '_mm_loadu_ps'.
+  // In AVX2 we process two lanes, so any of the 8 offsets cannot be safely used, fallback to the unsafe case.
+  // This is why then safelimit_4_pixels is used combined with safelimit_4 / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME.
+  const int width_safe_mod = (program->safelimit_4_pixels.overread_possible ? program->safelimit_4_pixels.source_overread_beyond_targetx : width) / PIXELS_AT_A_TIME * PIXELS_AT_A_TIME;
+
+  // Preconditions:
+  assert(program->filter_size_real <= 4); // We preload all relevant coefficients (up to 4) before the height loop.
+
+  // 'target_size_alignment' ensures we can safely access coefficients using offsets like
+  // 'filter_size * 3' when processing 4 H pixels at a time or
+  // 'filter_size * 7' when processing 8 H pixels at a time or
+  // 'filter_size * 15' when processing 16 H pixels at a time
+  assert(program->target_size_alignment >= 4);
+
+  // Ensure that coefficient loading beyond the valid target size is safe for 4x4 float loads.
+  assert(program->filter_size_alignment >= 4);
+
+  int x = 0;
+
+  // This 'auto' lambda construct replaces the need of templates
+  auto do_h_float_core = [&](auto partial_load) {
+    // Load up to 4 coefficients at once before the height loop.
+    // Pre-loading and transposing coefficients keeps register usage efficient.
+    // Assumes 'filter_size_aligned' is at least 4.
+    __m128 coeff_1 = _mm_load_ps(current_coeff + filter_size * 0); // Coefficients for the source pixel offset (for src_ptr + begin1 [0..3])
+    __m128 coeff_2 = _mm_load_ps(current_coeff + filter_size * 1); // for src_ptr + begin2 [0..3]
+    __m128 coeff_3 = _mm_load_ps(current_coeff + filter_size * 2); // for src_ptr + begin3 [0..3]
+    __m128 coeff_4 = _mm_load_ps(current_coeff + filter_size * 3); // for src_ptr + begin4 [0..3]
+
+    _MM_TRANSPOSE4_PS(coeff_1, coeff_2, coeff_3, coeff_4);
+
+    float* AVS_RESTRICT dst_ptr = dst + x;
+    const float* src_ptr = src;
+
+    // Pixel offsets for the current target x-positions.
+    // Even for x >= width, these offsets are guaranteed to be within the allocated 'target_size_alignment'.
+    const int begin1 = program->pixel_offset[x + 0];
+    const int begin2 = program->pixel_offset[x + 1];
+    const int begin3 = program->pixel_offset[x + 2];
+    const int begin4 = program->pixel_offset[x + 3];
+
+    for (int y = 0; y < height; y++)
+    {
+      __m128 data_1;
+      __m128 data_2;
+      __m128 data_3;
+      __m128 data_4;
+      if constexpr (partial_load) {
+        // In the potentially unsafe zone (near the right edge of the image), we use a safe loading function
+        // to prevent reading beyond the allocated source scanline. This handles cases where loading 4 floats
+        // starting from 'src_ptr + beginX' might exceed the source buffer.
+
+        // Example of the unsafe scenario: If target width is 320, a naive load at src_ptr + 317
+        // would attempt to read floats at indices 317, 318, 319, and 320, potentially going out of bounds.
+
+        // Two main issues in the unsafe zone:
+        // 1.) Out-of-bounds memory access: Reading beyond the allocated memory for the source scanline can
+        //     lead to access violations and crashes. '_mm_loadu_ps' attempts to load 16 bytes, so even if
+        //     the starting address is within bounds, subsequent reads might not be.
+        // 2.) Garbage or NaN values: Even if a read doesn't cause a crash, accessing uninitialized or
+        //     out-of-bounds memory (especially for float types) can result in garbage data, including NaN.
+        //     Multiplying by a valid coefficient and accumulating this NaN can contaminate the final result.
+
+        // 'load_partial_safe_sse2' safely loads up to 'filter_size_real' pixels and pads with zeros if needed,
+        // preventing out-of-bounds reads and ensuring predictable results even near the image edges.
+
+        data_1 = load_partial_safe_sse2<filtersizemod4>(src_ptr + begin1);
+        data_2 = load_partial_safe_sse2<filtersizemod4>(src_ptr + begin2);
+        data_3 = load_partial_safe_sse2<filtersizemod4>(src_ptr + begin3);
+        data_4 = load_partial_safe_sse2<filtersizemod4>(src_ptr + begin4);
+      }
+      else {
+        // In the safe zone, we can directly load 4 pixels at a time using unaligned loads.
+        data_1 = _mm_loadu_ps(src_ptr + begin1);
+        data_2 = _mm_loadu_ps(src_ptr + begin2);
+        data_3 = _mm_loadu_ps(src_ptr + begin3);
+        data_4 = _mm_loadu_ps(src_ptr + begin4);
+      }
+
+      _MM_TRANSPOSE4_PS(data_1, data_2, data_3, data_4);
+
+      __m128 result = _mm_mul_ps(data_1, coeff_1);
+      result = _mm_add_ps(_mm_mul_ps(data_2, coeff_2), result);
+      result = _mm_add_ps(_mm_mul_ps(data_3, coeff_3), result);
+      result = _mm_add_ps(_mm_mul_ps(data_4, coeff_4), result);
+
+      _mm_store_ps(dst_ptr, result);
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pitch;
+    } // y
+    current_coeff += filter_size * 4; // Move to the next set of coefficients for the next 4 output pixels
+    }; // end of lambda
+
+  // Process the 'safe zone' where direct full unaligned loads are acceptable.
+  for (; x < width_safe_mod; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::false_type{}); // partial_load == false, use direct _mm_loadu_ps
+  }
+
+  // Process the potentially 'unsafe zone' near the image edge, using safe loading.
+  for (; x < width; x += PIXELS_AT_A_TIME)
+  {
+    do_h_float_core(std::true_type{}); // partial_load == true, use the safer 'load_partial_safe_sse2'
+  }
+}
+
+// Instantiate them
+template void resize_h_planar_float_sse_transpose_vstripe_ks4<0>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_sse_transpose_vstripe_ks4<1>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_sse_transpose_vstripe_ks4<2>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+template void resize_h_planar_float_sse_transpose_vstripe_ks4<3>(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
diff --git a/avs_core/filters/intel/resample_sse.h b/avs_core/filters/intel/resample_sse.h
index 8c85795a3..c09228a07 100644
--- a/avs_core/filters/intel/resample_sse.h
+++ b/avs_core/filters/intel/resample_sse.h
@@ -60,4 +60,9 @@ __attribute__((__target__("ssse3")))
 #endif
 void resizer_h_ssse3_generic_float(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
 
+void resize_h_planar_float_sse_transpose(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
+template<int filtersizemod4>
+void resize_h_planar_float_sse_transpose_vstripe_ks4(BYTE* dst8, const BYTE* src8, int dst_pitch, int src_pitch, ResamplingProgram* program, int width, int height, int bits_per_pixel);
+
 #endif // __Resample_SSE_H__
diff --git a/avs_core/filters/resample.cpp b/avs_core/filters/resample.cpp
index 7caf9d467..18ca2ba5c 100644
--- a/avs_core/filters/resample.cpp
+++ b/avs_core/filters/resample.cpp
@@ -36,6 +36,9 @@
 #ifdef INTEL_INTRINSICS
 #include "intel/resample_sse.h"
 #include "intel/resample_avx2.h"
+#ifdef INTEL_INTRINSICS_AVX512
+#include "intel/resample_avx512.h"
+#endif
 #include "intel/turn_sse.h"
 #endif
 #include <avs/config.h>
@@ -71,15 +74,34 @@
 // while maintaining correct coefficient positioning and proper zero padding.
 
 
+static void checkAndSetOverread(int end_pos, SafeLimit& safelimit, int start_pos, int i, int source_size) {
+  if (end_pos > source_size) {
+    if (!safelimit.overread_possible) {
+      safelimit.overread_possible = true;
+      safelimit.source_overread_offset = start_pos;
+      safelimit.source_overread_beyond_targetx = i;
+    }
+  }
+}
+
+
 void resize_prepare_coeffs(ResamplingProgram* p, IScriptEnvironment* env, int filter_size_alignment) {
   p->filter_size_alignment = filter_size_alignment;
-  p->overread_possible = false;
+  p->safelimit_filter_size_aligned.overread_possible = false;
+  p->safelimit_4_pixels.overread_possible = false;
+  p->safelimit_8_pixels.overread_possible = false;
+  p->safelimit_16_pixels.overread_possible = false;
+  p->safelimit_32_pixels.overread_possible = false;
 
   // note: filter_size_real was the max(kernel_sizes[])
   int filter_size_aligned = AlignNumber(p->filter_size_real, p->filter_size_alignment);
 
   int target_size_aligned = AlignNumber(p->target_size, ALIGN_RESIZER_TARGET_SIZE);
 
+  // align target_size to 8 units to allow safe up to 8 pixels/cycle in H resizers. modded later.
+  p->target_size_alignment = ALIGN_RESIZER_TARGET_SIZE;
+
+
   // Common variables for both float and integer paths
   void* new_coeff = nullptr;
   void* src_coeff = nullptr;
@@ -146,28 +168,27 @@ void resize_prepare_coeffs(ResamplingProgram* p, IScriptEnvironment* env, int fi
     // we must protect against source scanline overread.
     // Using this not in only 32-bit float resizers is new in 3.7.4.
     const int start_pos = p->pixel_offset[i];
-    const int end_pos_aligned = start_pos + filter_size_aligned - 1;
     const int end_pos = start_pos + p->filter_size_real - 1;
     if (end_pos >= p->source_size) {
       // This issue has already been fixed, so it cannot occur.
     }
 
     // Check for SIMD optimization limits
-    if (end_pos_aligned >= p->source_size) {
-      if (!p->overread_possible) {
-        // Register the first occurrence, because we are entering the danger zone from here.
-        // Up to this point, template-based alignment-aware quick code can be used
-        // in H resizers. But beyond this point an e.g. _mm256_loadu_si256() would read into 
-        // invalid memory area at the end of the frame buffer.
-        p->overread_possible = true;
-        p->source_overread_offset = start_pos;
-        p->source_overread_beyond_targetx = i; 
-      }
-    }
+    // a.) when filter_size_aligned pixels are read (e.g. 16 byte SIMD load: 4 float pixels must be safely read)
+    // b.-e.) same for exacly 4, 8, 16 and 32 pixels
+    // We register only the first occurrence, because we are entering the danger zone from there.
+    // Up to this point, it is safe to read 4/8/... pixels from "start_pos" in the actual line.
+    // e.g. reading 4 floats will not read beyond the last pixel in line. Used in modified H resizers.
+
+    checkAndSetOverread(start_pos + filter_size_aligned - 1, p->safelimit_filter_size_aligned, start_pos, i, p->source_size);
+    checkAndSetOverread(start_pos + 4 - 1, p->safelimit_4_pixels, start_pos, i, p->source_size);
+    checkAndSetOverread(start_pos + 8 - 1, p->safelimit_8_pixels, start_pos, i, p->source_size);
+    checkAndSetOverread(start_pos + 16 - 1, p->safelimit_16_pixels, start_pos, i, p->source_size);
+    checkAndSetOverread(start_pos + 32 - 1, p->safelimit_32_pixels, start_pos, i, p->source_size);
   }
 
   // Fill the extra offset after target_size with fake values.
-  // Our aim is to have a safe, up to 8 pixels/cycle simd loop for V resizers.
+  // Our aim is to have a safe, up to 8 pixels/cycle simd loop for V and specific H resizers.
   // Their coeffs will be 0, so they don't count if such coeffs
   // are multiplied with invalid pixels.
   if (p->target_size < target_size_aligned) {
@@ -176,6 +197,8 @@ void resize_prepare_coeffs(ResamplingProgram* p, IScriptEnvironment* env, int fi
     for (int i = p->target_size; i < target_size_aligned; ++i) {
       p->kernel_sizes[i] = p->filter_size_real;
       p->pixel_offset[i] = 0; // 0th pixel offset makes no harm
+      // even if this ensures the in-line safety, alternative H resizer implementations must
+      // not read beyond last line, where y>=height.
     }
   }
 
@@ -1044,7 +1067,7 @@ void resizer_h_c_generic_uint8_16_vectorized(BYTE* dst8, const BYTE* src8, int d
   dst_pitch /= sizeof(pixel_t);
   src_pitch /= sizeof(pixel_t);
 
-  const int w_safe_mod8 = (program->overread_possible ? program->source_overread_beyond_targetx : width) / 8 * 8;
+  const int w_safe_mod8 = (program->safelimit_filter_size_aligned.overread_possible ? program->safelimit_filter_size_aligned.source_overread_beyond_targetx : width) / 8 * 8;
 
   for (int y = 0; y < height; y++) {
     const short* current_coeff_base = program->pixel_coefficient;
@@ -1584,11 +1607,46 @@ ResamplerH FilteredResizeH::GetResampler(int CPU, int pixelsize, int bits_per_pi
   }
   else { //if (pixelsize == 4)
 #ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+    if ((CPU & CPUF_AVX512F) && program->filter_size_real <= 4) {
+      //return resize_h_planar_float_avx2_permutex_vstripe_ks4;
+      switch (program->filter_size_real) {
+/*      case 1: return resize_h_planar_float_avx512_transpose_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_avx512_transpose_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_avx512_transpose_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_avx512_transpose_vstripe_ks4<0>; break;*/
+      case 1: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<1>; break;
+      case 2: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<2>; break;
+      case 3: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<3>; break;
+      case 4: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<0>; break;
+      }
+    }
+#endif
     if (CPU & CPUF_AVX2) {
-      return resizer_h_avx2_generic_float;
+      //return resize_h_planar_float_avx2_permutex_vstripe_ks4;
+      
+      switch (program->filter_size_real) {
+/*      case 1: return resize_h_planar_float_avx_transpose_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_avx_transpose_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_avx_transpose_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_avx_transpose_vstripe_ks4<0>; break;*/
+      case 1: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<0>; break;
+      default: return resizer_h_avx2_generic_float;
+      }
+      
     }
     if (CPU & CPUF_SSSE3) {
-      return resizer_h_ssse3_generic_float;
+      //      return resizer_h_ssse3_generic_float;
+      switch (program->filter_size_real) {
+      case 1: return resize_h_planar_float_sse_transpose_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_sse_transpose_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_sse_transpose_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_sse_transpose_vstripe_ks4<0>; break;
+      default: return resizer_h_ssse3_generic_float;
+      }
     }
 #endif
     return resize_h_c_planar<float, 0>;
@@ -1751,6 +1809,10 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi
     if (pixelsize == 1)
     {
 #ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if (CPU & CPUF_AVX512F)
+        return resize_v_avx512_planar_uint8_t_w_sr;
+#endif
       if (CPU & CPUF_AVX2)
         return resize_v_avx2_planar_uint8_t;
       if (CPU & CPUF_SSE2)
@@ -1766,6 +1828,13 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi
     else if (pixelsize == 2)
     {
 #ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if (CPU & CPUF_AVX512F)
+        if (bits_per_pixel < 16)
+          return resize_v_avx512_planar_uint16_t_w_sr<true>;
+        else
+          return resize_v_avx512_planar_uint16_t_w_sr<false>;
+#endif
       if (CPU & CPUF_AVX2) {
         if (bits_per_pixel < 16)
           return resize_v_avx2_planar_uint16_t<true>;
@@ -1788,8 +1857,15 @@ ResamplerV FilteredResizeV::GetResampler(int CPU, int pixelsize, int bits_per_pi
     else // pixelsize== 4
     {
 #ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if (CPU & CPUF_AVX512F) {
+//        return resize_v_avx512_planar_float;
+        return resize_v_avx512_planar_float_w_sr;
+      }
+#endif
       if (CPU & CPUF_AVX2) {
-        return resize_v_avx2_planar_float;
+//        return resize_v_avx2_planar_float;
+        return resize_v_avx2_planar_float_w_sr;
       }
       if (CPU & CPUF_SSE2) {
         return resize_v_sse2_planar_float;
@@ -1909,15 +1985,24 @@ PClip FilteredResize::CreateResize(PClip clip, int target_width, int target_heig
   // 3 - force H and V
   const bool force_H = force == 1 || force == 3;
   const bool force_V = force == 2 || force == 3;
-  if (area_FirstH < area_FirstV)
-  {
-    result = CreateResizeV(clip, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env);
-    result = CreateResizeH(result, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env);
-  }
+
+  if (force == 3) // not very good manual forcing of special 2pass mode, better to nake selection if both H and V resizs required, currently for test only
+    result = new FilteredResize_2p(clip,
+      subrange_left, subrange_width, target_width,
+      subrange_top, subrange_height, target_height,
+      f, preserve_center, chroma_placement, env); 
   else
   {
-    result = CreateResizeH(clip, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env);
-    result = CreateResizeV(result, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env);
+    if (area_FirstH < area_FirstV)
+    {
+      result = CreateResizeV(clip, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env);
+      result = CreateResizeH(result, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env);
+    }
+    else
+    {
+      result = CreateResizeH(clip, subrange_left, subrange_width, target_width, force_H, f, preserve_center, chroma_placement, env);
+      result = CreateResizeV(result, subrange_top, subrange_height, target_height, force_V, f, preserve_center, chroma_placement, env);
+    }
   }
   return result;
 }
@@ -2095,3 +2180,482 @@ AVSValue __cdecl FilteredResize::Create_UserDefined2Resize(AVSValue args, void*,
   return CreateResize(args[0].AsClip(), args[1].AsInt(), args[2].AsInt(), &args[6], force, &f, preserve_center, placement_name, forced_chroma_placement, env);
 }
 
+
+/***************************************
+ *****    Filtered Resize - 2p    ******
+ ***************************************/
+
+FilteredResize_2p::FilteredResize_2p(PClip _child,
+  double subrange_left, double subrange_width, int target_width,
+  double subrange_top, double subrange_height, int target_height,
+  ResamplingFunction* func, bool preserve_center, int chroma_placement, IScriptEnvironment* env)
+  : GenericVideoFilter(_child),
+  resampling_program_luma_h(0), resampling_program_chroma_h(0),
+  resampling_program_luma_v(0), resampling_program_chroma_v(0)
+{
+  if (target_height <= 0)
+    env->ThrowError("Resize: Height must be greater than 0.");
+
+  if (target_width <= 0)
+    env->ThrowError("Resize: Width must be greater than 0.");
+
+  // set class globals
+  src_width = vi.width;
+  src_height = vi.height;
+  dst_width = target_width;
+  dst_height = target_height;
+
+  pixelsize = vi.ComponentSize(); // AVS16
+  bits_per_pixel = vi.BitsPerComponent();
+  grey = vi.IsY();
+  bool isRGBPfamily = vi.IsPlanarRGB() || vi.IsPlanarRGBA();
+
+  if (vi.IsPlanar() && !grey && !isRGBPfamily) {
+    const int mask = (1 << vi.GetPlaneHeightSubsampling(PLANAR_U)) - 1;
+
+    if (target_height & mask)
+      env->ThrowError("Resize: Planar destination height must be a multiple of %d.", mask + 1);
+  }
+
+  if (vi.IsRGB() && !isRGBPfamily)
+    subrange_top = vi.height - subrange_top - subrange_height; // packed RGB upside down
+
+#ifdef INTEL_INTRINSICS
+  int cpu = env->GetCPUFlags();
+#else
+  int cpu = 0;
+#endif
+
+  double center_pos_v_luma;
+  double center_pos_v_chroma;
+  GetCenterShiftForResizers(center_pos_v_luma, center_pos_v_chroma, preserve_center, chroma_placement, vi, false /* for vertical */);
+
+  double center_pos_h_luma;
+  double center_pos_h_chroma;
+  GetCenterShiftForResizers(center_pos_h_luma, center_pos_h_chroma, preserve_center, chroma_placement, vi, true /* for horizontal */);
+  // 3.7.4- parameter, old Avisynth behavior: 0.5, 0.5
+
+  // Create resampling program and pitch table for H
+  resampling_program_luma_h = func->GetResamplingProgram(vi.width, subrange_left, subrange_width, target_width, bits_per_pixel,
+    center_pos_h_luma, center_pos_h_luma, // for resizing it's the same for source and dest
+    env);
+  resampler_luma_h = GetResamplerH(cpu, pixelsize, bits_per_pixel, resampling_program_luma_h, env);
+
+  // Create resampling program and pitch table for V
+  resampling_program_luma_v = func->GetResamplingProgram(vi.height, subrange_top, subrange_height, target_height, bits_per_pixel,
+    center_pos_v_luma, center_pos_v_luma, // for resizing it's the same for source and dest
+    env);
+  resampler_luma_v = GetResamplerV(cpu, pixelsize, bits_per_pixel, resampling_program_luma_v, env);
+
+
+  if (vi.IsPlanar() && !grey && !isRGBPfamily) {
+    const int shift = vi.GetPlaneHeightSubsampling(PLANAR_U);
+    const int div = 1 << shift;
+
+    resampling_program_chroma_v = func->GetResamplingProgram(
+      vi.height >> shift,
+      subrange_top / div,
+      subrange_height / div,
+      target_height >> shift,
+      bits_per_pixel,
+      center_pos_v_chroma, center_pos_v_chroma, // for resizing it's the same for source and dest
+      env);
+
+    resampler_chroma_v = GetResamplerV(cpu, pixelsize, bits_per_pixel, resampling_program_chroma_v, env);
+  }
+
+  if (vi.IsPlanar() && !grey && !isRGBPfamily) {
+    const int shift = vi.GetPlaneWidthSubsampling(PLANAR_U);
+    const int div = 1 << shift;
+
+    resampling_program_chroma_h = func->GetResamplingProgram(
+      vi.width >> shift,
+      subrange_left / div,
+      subrange_width / div,
+      target_width >> shift,
+      bits_per_pixel,
+      center_pos_h_chroma, center_pos_h_chroma, // horizontal
+      env);
+
+    resampler_chroma_h = GetResamplerH(cpu, pixelsize, bits_per_pixel, resampling_program_chroma_h, env);
+  }
+
+  // Change target video info size
+  vi.height = target_height;
+  vi.width = target_width;
+}
+
+#if 0 // expected worse in performance - left for performance tests
+PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env) // use env->Allocate() to get temp buf from other allocated memory - it is NOT returned to the memory pool for the NewVideoFrameP for the downstream filter to write to ?
+{
+  PVideoFrame src = child->GetFrame(n, env);
+  PVideoFrame dst = env->NewVideoFrameP(vi, &src);
+  int src_pitch = src->GetPitch();
+  int dst_pitch = dst->GetPitch();
+  const BYTE* srcp = src->GetReadPtr();
+  BYTE* dstp = dst->GetWritePtr(); // for first (largest ?) plane or for single ?
+
+  bool isRGBPfamily = vi.IsPlanarRGB() || vi.IsPlanarRGBA();
+
+  BYTE* temp_1 = static_cast<BYTE*>(env->Allocate(dst_pitch * dst_height, FRAME_ALIGN, AVS_POOLED_ALLOC));
+  if (!temp_1 ) {
+    env->Free(temp_1);
+    env->ThrowError("Could not reserve temp memory in a resampler_2p.");
+  }
+
+  // Do resizing, single plane by plane
+  resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+  int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
+  resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+
+  if (isRGBPfamily)
+  {
+    src_pitch = src->GetPitch(PLANAR_B);
+    dst_pitch = dst->GetPitch(PLANAR_B);
+    srcp = src->GetReadPtr(PLANAR_B);
+    dstp = dst->GetWritePtr(PLANAR_B);
+
+    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
+    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+
+    src_pitch = src->GetPitch(PLANAR_R);
+    dst_pitch = dst->GetPitch(PLANAR_R);
+    srcp = src->GetReadPtr(PLANAR_R);
+    dstp = dst->GetWritePtr(PLANAR_R);
+
+    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+
+  }
+  else if (!grey && vi.IsPlanar()) {
+    int width = vi.width >> vi.GetPlaneWidthSubsampling(PLANAR_U);
+    int height = vi.height >> vi.GetPlaneHeightSubsampling(PLANAR_U);
+
+    // Plane U resizing
+    src_pitch = src->GetPitch(PLANAR_U);
+    dst_pitch = dst->GetPitch(PLANAR_U);
+    srcp = src->GetReadPtr(PLANAR_U);
+    dstp = dst->GetWritePtr(PLANAR_U);
+
+    resampler_chroma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel);
+    resampler_chroma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
+
+    // Plane V resizing
+    src_pitch = src->GetPitch(PLANAR_V);
+    dst_pitch = dst->GetPitch(PLANAR_V);
+    srcp = src->GetReadPtr(PLANAR_V);
+    dstp = dst->GetWritePtr(PLANAR_V);
+
+    resampler_chroma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel);
+    resampler_chroma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
+
+  }
+
+  if (vi.IsYUVA() || vi.IsPlanarRGBA()) {
+    src_pitch = src->GetPitch(PLANAR_A);
+    dst_pitch = dst->GetPitch(PLANAR_A);
+    srcp = src->GetReadPtr(PLANAR_A);
+    dstp = dst->GetWritePtr(PLANAR_A);
+
+    resampler_luma_h(temp_1, srcp, dst_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
+    resampler_luma_v(dstp, temp_1, dst_pitch, dst_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+  }
+
+  env->Free(temp_1);
+
+  return dst;
+}
+#endif
+
+PVideoFrame __stdcall FilteredResize_2p::GetFrame(int n, IScriptEnvironment* env) // use NewVideoFrame as temp buf to return it in the vfb pool after exit this filter
+{
+  PVideoFrame src = child->GetFrame(n, env);
+  PVideoFrame dst = env->NewVideoFrameP(vi, &src);
+
+  PVideoFrame tmp = env->NewVideoFrame(vi); // no need frame properties copy, use as temporal buffer only and its refcount will be zeroed at function exit with object auto-release/destructor (PVideoFrame::~PVideoFrame() )
+  /*
+  Here we need to ask ScriptEnvironment to look for output format of downstream filter ? So it is not trans-in-place filter we can request frame buffer larger and left
+  it unused after exiting this function. Only in this case there is a big probability the env->NewVideoFrameP(vi, &src); for downstream filter call will return this same virtual address buffer
+  to the downstream filter and it can be (at least partially) overwritten saving from useless downloading from CPU cache. It is new TODO idea for modification of ScriptEnvironment vfb memory management.
+  After this will be implemented - we can use such method of requesting temp buffer (frame) to use in 2pass resize.
+  If this is last filter in a chain - simply request lowest possible sized frame.
+
+  Update 30.05.2025: The expected transfer of tmp buf address to downstream fiter dst frame sometime happens - but how frequently it happens in real scripts running - need to be discovered.
+
+  As env->Allocate/Free buffers are definitely worse (only good if downstream filter will request same temp buf for write) - this temp method expected to be faster (as first expectations).
+  */
+
+  int src_pitch = src->GetPitch();
+  int dst_pitch = dst->GetPitch();
+  const BYTE* srcp = src->GetReadPtr();
+  BYTE* dstp = dst->GetWritePtr(); // for first (largest ?) plane or for single ?
+
+  bool isRGBPfamily = vi.IsPlanarRGB() || vi.IsPlanarRGBA();
+
+  const BYTE* tmp_srcp = tmp->GetReadPtr();
+  BYTE* tmp_dstp = tmp->GetWritePtr(); // for first (largest ?) plane or for single ?
+  int tmp_pitch = tmp->GetPitch();
+
+  // Do resizing, single plane by plane
+  resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+  int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
+  resampler_luma_v(dstp, tmp_srcp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+
+  
+  if (isRGBPfamily)
+  {
+    src_pitch = src->GetPitch(PLANAR_B);
+    dst_pitch = dst->GetPitch(PLANAR_B);
+    srcp = src->GetReadPtr(PLANAR_B);
+    dstp = dst->GetWritePtr(PLANAR_B);
+
+    resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
+    resampler_luma_v(dstp, tmp_srcp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+
+    src_pitch = src->GetPitch(PLANAR_R);
+    dst_pitch = dst->GetPitch(PLANAR_R);
+    srcp = src->GetReadPtr(PLANAR_R);
+    dstp = dst->GetWritePtr(PLANAR_R);
+
+    resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    resampler_luma_v(dstp, tmp_srcp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+
+  }
+  else if (!grey && vi.IsPlanar()) {
+    int width = vi.width >> vi.GetPlaneWidthSubsampling(PLANAR_U);
+    int height = vi.height >> vi.GetPlaneHeightSubsampling(PLANAR_U);
+
+    // Plane U resizing
+    src_pitch = src->GetPitch(PLANAR_U);
+    dst_pitch = dst->GetPitch(PLANAR_U);
+    srcp = src->GetReadPtr(PLANAR_U);
+    dstp = dst->GetWritePtr(PLANAR_U);
+
+    resampler_chroma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_U), bits_per_pixel);
+    resampler_chroma_v(dstp, tmp_dstp, dst_pitch, tmp_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
+
+    // Plane V resizing
+    src_pitch = src->GetPitch(PLANAR_V);
+    dst_pitch = dst->GetPitch(PLANAR_V);
+    srcp = src->GetReadPtr(PLANAR_V);
+    dstp = dst->GetWritePtr(PLANAR_V);
+
+    resampler_chroma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_chroma_h, width, src_height >> vi.GetPlaneHeightSubsampling(PLANAR_V), bits_per_pixel);
+    resampler_chroma_v(dstp, tmp_dstp, dst_pitch, tmp_pitch, resampling_program_chroma_v, width, height, bits_per_pixel);
+
+  }
+
+  if (vi.IsYUVA() || vi.IsPlanarRGBA()) {
+    src_pitch = src->GetPitch(PLANAR_A);
+    dst_pitch = dst->GetPitch(PLANAR_A);
+    srcp = src->GetReadPtr(PLANAR_A);
+    dstp = dst->GetWritePtr(PLANAR_A);
+
+    resampler_luma_h(tmp_dstp, srcp, tmp_pitch, src_pitch, resampling_program_luma_h, dst_width, src_height, bits_per_pixel);
+    int work_height = vi.IsPlanar() ? vi.width : vi.BytesFromPixels(vi.width) / pixelsize; // packed RGB: or vi.width * vi.NumComponent()
+    resampler_luma_v(dstp, tmp_dstp, dst_pitch, tmp_pitch, resampling_program_luma_v, work_height, vi.height, bits_per_pixel);
+  }
+
+  return dst;
+}
+
+
+ResamplerV FilteredResize_2p::GetResamplerV(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env) // may be somehow call same method from FilteredResizeV class ?
+{
+
+  resize_prepare_coeffs(program, env, 8);
+  // for SIMD friendliness and more: consolidate the kernel_size vs filter_size at the end.
+  // See comments at FilteredResizeH::GetResampler
+
+  if (program->filter_size == 1) {
+    // Fast pointresize
+    switch (pixelsize) // AVS16
+    {
+    case 1: return resize_v_planar_pointresize<uint8_t>;
+    case 2: return resize_v_planar_pointresize<uint16_t>;
+    default: // case 4:
+      return resize_v_planar_pointresize<float>;
+    }
+  }
+  else {
+    // Other resizers
+    if (pixelsize == 1)
+    {
+#ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if (CPU & CPUF_AVX512F)
+        return resize_v_avx512_planar_uint8_t_w_sr;
+#endif
+      if (CPU & CPUF_AVX2)
+        return resize_v_avx2_planar_uint8_t;
+      if (CPU & CPUF_SSE2)
+        return resize_v_sse2_planar;
+#ifdef X86_32
+      if (CPU & CPUF_MMX)
+        return resize_v_mmx_planar;
+#endif
+#endif
+      // C version
+      return resize_v_c_planar_uint8_16_t_auto_vectorized<uint8_t, true>;
+    }
+    else if (pixelsize == 2)
+    {
+#ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if (CPU & CPUF_AVX512F)
+        if (bits_per_pixel < 16)
+          return resize_v_avx512_planar_uint16_t_w_sr<true>;
+        else
+          return resize_v_avx512_planar_uint16_t_w_sr<false>;
+#endif
+      if (CPU & CPUF_AVX2) {
+        if (bits_per_pixel < 16)
+          return resize_v_avx2_planar_uint16_t<true>;
+        else
+          return resize_v_avx2_planar_uint16_t<false>;
+      }
+      if (CPU & CPUF_SSE2) {
+        if (bits_per_pixel < 16)
+          return resize_v_sse2_planar_uint16_t<true>;
+        else
+          return resize_v_sse2_planar_uint16_t<false>;
+      }
+#endif
+      // C version
+      if (bits_per_pixel == 16)
+        return resize_v_c_planar_uint8_16_t_auto_vectorized<uint16_t, false>;
+      else
+        return resize_v_c_planar_uint8_16_t_auto_vectorized<uint16_t, true>;
+    }
+    else // pixelsize== 4
+    {
+#ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+      if (CPU & CPUF_AVX512F) {
+//        return resize_v_avx512_planar_float;
+        return resize_v_avx512_planar_float_w_sr;
+      }
+#endif
+      if (CPU & CPUF_AVX2) {
+        return resize_v_avx2_planar_float;
+      }
+      if (CPU & CPUF_SSE2) {
+        return resize_v_sse2_planar_float;
+      }
+#endif
+      return resize_v_c_planar_float_auto_vectorized;
+    }
+  }
+}
+
+ResamplerH FilteredResize_2p::GetResamplerH(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env) // may be somehow call same method from FilteredResizeH class ?
+{
+  int simd_coeff_count_padding = 8;
+
+  // Both 8-bit and 16-bit SSSE3 and AVX2 horizontal resizers benefit from processing 16 pixels per cycle.
+  // Floats also use 32 bytes, but since 32/sizeof(float) = 8, processing 16 pixels is unnecessary.
+  // Even in C, the code is optimized to be vector-friendly.
+  if (pixelsize == 1 || pixelsize == 2)
+    simd_coeff_count_padding = 16;
+
+  // Not only does it prepare and pad for SIMD/vector code, but it also corrects, reorders, and equalizes coefficients 
+  // at the right and bottom ends, since we may have variable kernel sizes due to boundary conditions.
+  resize_prepare_coeffs(program, env, simd_coeff_count_padding);
+
+  if (pixelsize == 1)
+  {
+#ifdef INTEL_INTRINSICS
+    if (CPU & CPUF_AVX2) {
+      return resizer_h_avx2_generic_uint8_t;
+    }
+    if (CPU & CPUF_SSSE3) {
+      return resizer_h_ssse3_generic_uint8_16<uint8_t, true>;
+    }
+#endif
+    return resizer_h_c_generic_uint8_16_vectorized<uint8_t, true>;
+    //return resize_h_c_planar<uint8_t, 1>;
+  }
+  else if (pixelsize == 2) {
+#ifdef INTEL_INTRINSICS
+    if (CPU & CPUF_AVX2) {
+      if (bits_per_pixel < 16)
+        return resizer_h_avx2_generic_uint16_t<true>;
+      else
+        return resizer_h_avx2_generic_uint16_t<false>;
+    }
+    if (CPU & CPUF_SSSE3) {
+      if (bits_per_pixel < 16)
+        return resizer_h_ssse3_generic_uint8_16<uint16_t, true>;
+      else
+        return resizer_h_ssse3_generic_uint8_16<uint16_t, false>;
+    }
+#endif
+    if (bits_per_pixel == 16)
+      return resizer_h_c_generic_uint8_16_vectorized<uint16_t, false>;
+    // return resize_h_c_planar<uint16_t, 0>;
+    else
+      return resizer_h_c_generic_uint8_16_vectorized<uint16_t, true>;
+    // return resize_h_c_planar<uint16_t, 1>;
+  }
+  else { //if (pixelsize == 4)
+#ifdef INTEL_INTRINSICS
+#ifdef INTEL_INTRINSICS_AVX512
+    if ((CPU & CPUF_AVX512F) && program->filter_size_real <= 4) {
+      //return resize_h_planar_float_avx2_permutex_vstripe_ks4;
+      switch (program->filter_size_real) {
+        /*      case 1: return resize_h_planar_float_avx512_transpose_vstripe_ks4<1>; break;
+              case 2: return resize_h_planar_float_avx512_transpose_vstripe_ks4<2>; break;
+              case 3: return resize_h_planar_float_avx512_transpose_vstripe_ks4<3>; break;
+              case 4: return resize_h_planar_float_avx512_transpose_vstripe_ks4<0>; break;*/
+      case 1: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<1>; break;
+      case 2: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<2>; break;
+      case 3: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<3>; break;
+      case 4: return resize_h_planar_float_avx512_gather_permutex_vstripe_ks4_2w<0>; break;
+      }
+    }
+#endif
+    if (CPU & CPUF_AVX2) {
+      //return resize_h_planar_float_avx2_permutex_vstripe_ks4;
+
+      switch (program->filter_size_real) {
+        /*      case 1: return resize_h_planar_float_avx_transpose_vstripe_ks4<1>; break;
+              case 2: return resize_h_planar_float_avx_transpose_vstripe_ks4<2>; break;
+              case 3: return resize_h_planar_float_avx_transpose_vstripe_ks4<3>; break;
+              case 4: return resize_h_planar_float_avx_transpose_vstripe_ks4<0>; break;*/
+      case 1: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_avx2_gather_permutex_vstripe_ks4<0>; break;
+      default: return resizer_h_avx2_generic_float;
+      }
+
+    }
+    if (CPU & CPUF_SSSE3) {
+      //      return resizer_h_ssse3_generic_float;
+      switch (program->filter_size_real) {
+      case 1: return resize_h_planar_float_sse_transpose_vstripe_ks4<1>; break;
+      case 2: return resize_h_planar_float_sse_transpose_vstripe_ks4<2>; break;
+      case 3: return resize_h_planar_float_sse_transpose_vstripe_ks4<3>; break;
+      case 4: return resize_h_planar_float_sse_transpose_vstripe_ks4<0>; break;
+      default: return resizer_h_ssse3_generic_float;
+      }
+    }
+#endif
+    return resize_h_c_planar<float, 0>;
+  }
+}
+
+
+FilteredResize_2p::~FilteredResize_2p(void)
+{
+  if (resampling_program_luma_h) { delete resampling_program_luma_h; }
+  if (resampling_program_chroma_h) { delete resampling_program_chroma_h; }
+
+  if (resampling_program_luma_v) { delete resampling_program_luma_v; }
+  if (resampling_program_chroma_v) { delete resampling_program_chroma_v; }
+
+
+}
diff --git a/avs_core/filters/resample.h b/avs_core/filters/resample.h
index 8afed8775..d6272ebee 100644
--- a/avs_core/filters/resample.h
+++ b/avs_core/filters/resample.h
@@ -121,6 +121,52 @@ class FilteredResizeV : public GenericVideoFilter
 };
 
 
+/**
+  * Class to resize in the dual directions using a specified sampling filter and lower size used temporal buffer
+  * Helper for resample functions
+ **/
+class FilteredResize_2p : public GenericVideoFilter
+{
+public:
+  FilteredResize_2p(PClip _child,
+    double subrange_left, double subrange_width, int target_width, 
+    double subrange_top, double subrange_height, int target_height,
+    ResamplingFunction* func, bool preserve_center, int chroma_placement, IScriptEnvironment* env);
+  virtual ~FilteredResize_2p(void);
+
+  PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) override;
+
+  int __stdcall SetCacheHints(int cachehints, int frame_range) override {
+    AVS_UNUSED(frame_range);
+    return cachehints == CACHE_GET_MTMODE ? MT_NICE_FILTER : 0;
+  }
+
+  static ResamplerH GetResamplerH(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env);
+  static ResamplerV GetResamplerV(int CPU, int pixelsize, int bits_per_pixel, ResamplingProgram* program, IScriptEnvironment* env);
+
+private:
+  bool grey;
+  int pixelsize; // AVS16
+  int bits_per_pixel;
+
+  int src_width, src_height, dst_width, dst_height;
+
+  ResamplingProgram* resampling_program_luma_h;
+  ResamplingProgram* resampling_program_chroma_h;
+
+  ResamplingProgram* resampling_program_luma_v;
+  ResamplingProgram* resampling_program_chroma_v;
+
+  ResamplerH resampler_luma_h;
+  ResamplerH resampler_chroma_h;
+
+  ResamplerV resampler_luma_v;
+  ResamplerV resampler_chroma_v;
+
+};
+
+
+
 /*** Resample factory methods ***/
 
 class FilteredResize
diff --git a/avs_core/filters/resample_functions.cpp b/avs_core/filters/resample_functions.cpp
index 437bc6a73..4b0b3399e 100644
--- a/avs_core/filters/resample_functions.cpp
+++ b/avs_core/filters/resample_functions.cpp
@@ -539,9 +539,9 @@ ResamplingProgram* ResamplingFunction::GetResamplingProgram(int source_size, dou
     // in order not to have NaN floats
     if (start_pos + AlignNumber(fir_filter_size, ALIGN_FLOAT_RESIZER_COEFF_SIZE) - 1 > source_size - 1)
     {
-      if (!program->overread_possible) {
+      if (!program->overread_possible_filter_size_aligned) {
         // register the first occurance
-        program->overread_possible = true;
+        program->overread_possible_filter_size_aligned = true;
         program->source_overread_offset = start_pos;
         program->source_overread_beyond_targetx = i;
       }
diff --git a/avs_core/filters/resample_functions.h b/avs_core/filters/resample_functions.h
index 4e7b71f58..738046824 100644
--- a/avs_core/filters/resample_functions.h
+++ b/avs_core/filters/resample_functions.h
@@ -48,10 +48,16 @@ constexpr int FPScale = 1 << FPScale8bits; // fixed point scaler (1<<14)
 // for 16 bits: one bit less
 constexpr int FPScale16bits = 13;
 constexpr int FPScale16 = 1 << FPScale16bits; // fixed point scaler for 10-16 bit SIMD signed operation
-constexpr int ALIGN_RESIZER_TARGET_SIZE = 8;
+constexpr int ALIGN_RESIZER_TARGET_SIZE = 16; // 16: avx512 float Hoprizontal
 // 09-14-2002 - Vlad59 - Lanczos3Resize - Constant added
 #define M_PI 3.14159265358979323846
 
+struct SafeLimit {
+  bool overread_possible;
+  int source_overread_offset;
+  int source_overread_beyond_targetx;
+};
+
 struct ResamplingProgram {
   IScriptEnvironment * Env;
   int source_size, target_size;
@@ -59,6 +65,7 @@ struct ResamplingProgram {
   int filter_size;
   int filter_size_real; // maybe less than filter_size if dimensions are small
   int filter_size_alignment; // for info, 1 (C, nonvector-friendly), 8 (sse or avx2) or 16 (avx2)
+  int target_size_alignment; // coeff table exists (and containt zero coeffs) even beyond target_size. Helps alternative H resizers.
 
   // Array of Integer indicate starting point of sampling
   std::vector<int> pixel_offset;
@@ -74,24 +81,24 @@ struct ResamplingProgram {
   std::vector<short> kernel_sizes; 
   // 3.7.4- can be different for each line but then they get equalized and aligned.
 
-  // anti-overread helpers for float resizer simd code reading 8 pixels from a given offset
-  bool overread_possible;
-  int source_overread_offset; // offset from where reading 8 bytes requires masking garbage on the right side
-  int source_overread_beyond_targetx; 
   // in H resizers danger zone starts from here.
-  // When reading aligned_filter_size elements from (src+offset) no longer fits image scanline dimensions
-
+  // When reading multiple (SIMD load) source pixels from (src+offset) and it no
+  // longer fits image scanline dimensions (width)
+  SafeLimit safelimit_filter_size_aligned = { false, -1, -1 };
+  SafeLimit safelimit_4_pixels = { false, -1, -1 };
+  SafeLimit safelimit_8_pixels = { false, -1, -1 };
+  SafeLimit safelimit_16_pixels = { false, -1, -1 };
+  SafeLimit safelimit_32_pixels = { false, -1, -1 };
 
   ResamplingProgram(int filter_size, int source_size, int target_size, double crop_start, double crop_size, int bits_per_pixel, IScriptEnvironment* env)
     : Env(env), source_size(source_size), target_size(target_size), crop_start(crop_start), crop_size(crop_size), filter_size(filter_size), filter_size_real(filter_size),
     bits_per_pixel(bits_per_pixel), pixel_coefficient(0), pixel_coefficient_float(0)
   {
-    overread_possible = false;
-    source_overread_offset = -1;
-    source_overread_beyond_targetx = -1;
 
-    // align target_size to 8 units to allow safe 8 pixels/cycle in H resizers
+    
     filter_size_alignment = 1;
+    // align target_size to 8 units to allow safe up to 8 pixels/cycle in H resizers. modded later.
+    target_size_alignment = 1;
     // resize_prepare_coeff can override and realign the size of coefficient table
     if (bits_per_pixel < 32)
       pixel_coefficient = (short*)Env->Allocate(sizeof(short) * target_size * filter_size, 64, AVS_NORMAL_ALLOC);