Skip to content

Commit d250c1c

Browse files
committed
*Extend the use of the AVX-512BW optimization of function Convolution32fNhwcDepthwise_k7p3d1s1w4.
1 parent 1e942f4 commit d250c1c

File tree

3 files changed

+15
-6
lines changed

3 files changed

+15
-6
lines changed

docs/2025.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ <h5>Improving</h5>
4747
<li>Extend the use of the AMX-BF16 optimization of function DepthwiseConvolution_k7p3d1s1w4.</li>
4848
<li>Extend the use of the AMX-BF16 optimization of function DepthwiseConvolution_k7p3d1s1w6.</li>
4949
<li>Extend the use of the AMX-BF16 optimization of function DepthwiseConvolution_k7p3d1s1w8.</li>
50+
<li>Extend the use of the AVX-512BW optimization of function Convolution32fNhwcDepthwise_k7p3d1s1w4.</li>
5051
</ul>
5152
<h5>Bug fixing</h5>
5253
<ul>

src/Simd/SimdAvx512bwSynetConvolution32fNhwcDepthwise.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -885,11 +885,16 @@ namespace Simd
885885

886886
//-------------------------------------------------------------------------------------------------
887887

888+
// Predicate used to select Convolution32fNhwcDepthwise_k7p3d1s1w4.
// Returns true when p.IsKernel(7), p.IsPad(3), p.IsStride(1) and p.IsDilation(1)
// all hold and the source width p.srcW is at least 7. The same condition is
// asserted at the top of Convolution32fNhwcDepthwise_k7p3d1s1w4.
// NOTE(review): presumably the srcW >= 7 bound is what the kernel's overlapping
// 4-column tail step needs; confirm against the full kernel body.
static SIMD_INLINE bool Preferable_k7p3d1s1w4(const ConvParam& p)
889+
{
890+
return p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && p.srcW >= 7;
891+
}
892+
888893
template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwise_k7p3d1s1w4(const float* src, const ConvParam& p, const float* weight, const float* bias, const float* params, float* dst)
889894
{
890-
assert(p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && Aligned(p.srcW, 4));
895+
assert(p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && p.srcW >= 7);
891896

892-
size_t dstC = p.dstC, dstW = p.dstW, srcH = p.srcH, end = dstW - 4;
897+
size_t dstC = p.dstC, dstW = p.dstW, srcH = p.srcH, endW = dstW - 4;
893898
__m512 s0, s1, w0, w1, w2, w3, w4, w5, w6, d0, d1, d2, d3, _params[2];
894899
_params[0] = _mm512_set1_ps(params[0]);
895900
if (type == SimdConvolutionActivationRestrictRange ||
@@ -898,7 +903,7 @@ namespace Simd
898903
_params[1] = _mm512_set1_ps(params[1]);
899904
for (size_t dy = 0; dy < p.dstH; ++dy)
900905
{
901-
for (size_t dx = 0; dx < dstW; dx += 4)
906+
for (size_t dx = 0;; dx += Min<size_t>(4, endW - dx))
902907
{
903908
for (size_t dc = 0; dc < dstC; dc += F)
904909
{
@@ -958,7 +963,7 @@ namespace Simd
958963
d1 = _mm512_fmadd_ps(s0, w5, d1);
959964
d2 = _mm512_fmadd_ps(s0, w4, d2);
960965
d3 = _mm512_fmadd_ps(s0, w3, d3);
961-
if (dx < end)
966+
if (dx < endW)
962967
{
963968
s1 = _mm512_maskz_loadu_ps(tail, ps + 7 * dstC);
964969
d1 = _mm512_fmadd_ps(s1, w6, d1);
@@ -980,6 +985,8 @@ namespace Simd
980985
_mm512_mask_storeu_ps(pd + 2 * dstC, tail, Activate<type>(d2, _params, 0));
981986
_mm512_mask_storeu_ps(pd + 3 * dstC, tail, Activate<type>(d3, _params, 0));
982987
}
988+
if (dx == endW)
989+
break;
983990
}
984991
}
985992
}
@@ -1256,7 +1263,7 @@ namespace Simd
12561263
return Convolution32fNhwcDepthwise_k7p3d1s1w8<type>;
12571264
else if (p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && AlignedAny(p.srcW, 6))
12581265
return Convolution32fNhwcDepthwise_k7p3d1s1w6<type>;
1259-
else if (p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && Aligned(p.srcW, 4))
1266+
else if (Preferable_k7p3d1s1w4(p))
12601267
return Convolution32fNhwcDepthwise_k7p3d1s1w4<type>;
12611268
else if (p.IsKernel(3) && p.IsDilation(1))
12621269
return Convolution32fNhwcDepthwise3x3<type>;

src/Test/TestSynetConvolution32f.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,10 +245,11 @@ namespace Test
245245
result = result && SynetConvolution32fForwardAutoTest(eps, Param(24, 2048, 6, 6, 255, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
246246
#endif
247247
#if 1
248+
result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 816, 14, 14, 816, _5, _1, _1, _2, _2, 816, aRe, t), f1, f2);
248249
result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 304, 16, 16, 304, _3, _1, _1, _1, _1, 304, aRe, t), f1, f2);
249250
result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 304, 16, 16, 304, _7, _1, _1, _3, _3, 304, aPr, t), f1, f2);
250251
result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 608, 8, 8, 608, _7, _1, _1, _3, _3, 608, aRe, t), f1, f2);
251-
result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 608, 7, 6, 608, _7, _1, _1, _3, _3, 608, aRe, t), f1, f2);
252+
result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 608, 7, 7, 608, _7, _1, _1, _3, _3, 608, aRe, t), f1, f2);
252253
result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 152, 32, 32, 152, _7, _1, _1, _3, _3, 152, aRe, t), f1, f2);
253254
result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 76, 64, 64, 76, _7, _1, _1, _3, _3, 76, aRe, t), f1, f2);
254255
#endif

0 commit comments

Comments
 (0)