diff --git a/main/boofcv-ip/src/generate/java/boofcv/alg/filter/convolve/noborder/GenerateImplConvolveMean.java b/main/boofcv-ip/src/generate/java/boofcv/alg/filter/convolve/noborder/GenerateImplConvolveMean.java index b88c36f339..c15c457abc 100644 --- a/main/boofcv-ip/src/generate/java/boofcv/alg/filter/convolve/noborder/GenerateImplConvolveMean.java +++ b/main/boofcv-ip/src/generate/java/boofcv/alg/filter/convolve/noborder/GenerateImplConvolveMean.java @@ -92,25 +92,34 @@ public void printHorizontalBorder() { String body = "\t\t\tint indexDest = output.startIndex + y*output.stride;\n" + "\t\t\tint j = input.startIndex + y*input.stride;\n" + "\n" + - "\t\t\tfor (int i = 0; i < offset; i++) {\n" + - "\t\t\t\tint jEnd = j + i + length - offset;\n" + - "\t\t\t\t" + sumType + " total = 0;\n" + + "\t\t\t" + sumType + " total = 0;\n" + + "\t\t\tint count = length - offset;\n" + + "\t\t\tint jEnd = j + count;\n" + + "\t\t\tif (offset > 0) {\n" + "\t\t\t\tfor (int indexSrc = j; indexSrc < jEnd; indexSrc++) {\n" + "\t\t\t\t\ttotal += dataSrc[indexSrc]" + bitWise + ";\n" + "\t\t\t\t}\n" + - "\t\t\t\tint count = jEnd - j;\n" + "\t\t\t\tdataDst[indexDest++] = " + divide + ";\n" + "\t\t\t}\n" + "\n" + - "\t\t\tint jEnd = j + width;\n" + - "\t\t\tj += width - (offset + offsetR);\n" + - "\t\t\tindexDest += width - (offset + offsetR);\n" + - "\t\t\tfor (int i = 0; i < offsetR; i++) {\n" + - "\t\t\t\t" + sumType + " total = 0;\n" + - "\t\t\t\tfor (int indexSrc = j + i; indexSrc < jEnd; indexSrc++) {\n" + + "\t\t\twhile (++count < length) {\n" + + "\t\t\t\ttotal += dataSrc[jEnd++]" + bitWise + ";\n" + + "\t\t\t\tdataDst[indexDest++] = " + divide + ";\n" + + "\t\t\t}\n" + + "\n" + + "\t\t\tjEnd = j + width;\n" + + "\t\t\tcount = offset + offsetR;\n" + + "\t\t\tj += width - count;\n" + + "\t\t\tindexDest += width - count;\n" + + "\t\t\ttotal = 0;\n" + + "\t\t\tif (offsetR > 0) {\n" + + "\t\t\t\tfor (int indexSrc = j; indexSrc < jEnd; indexSrc++) {\n" + "\t\t\t\t\ttotal += 
dataSrc[indexSrc]" + bitWise + ";\n" + "\t\t\t\t}\n" + - "\t\t\t\tint count = jEnd - j - i;\n" + + "\t\t\t\tdataDst[indexDest++] = " + divide + ";\n" + + "\t\t\t}\n" + + "\t\t\twhile (--count > offset) {\n" + + "\t\t\t\ttotal -= dataSrc[j++]" + bitWise + ";\n" + "\t\t\t\tdataDst[indexDest++] = " + divide + ";\n" + "\t\t\t}\n"; @@ -125,7 +134,7 @@ public void printHorizontal() { String bitWise = imageIn.getBitWise(); String declareHalf = imageIn.isInteger() ? "\t\tfinal " + sumType + " halfDivisor = divisor/2;\n" : ""; - String divide = imageIn.isInteger() ? "(total+halfDivisor)/divisor" : "total/divisor"; + String divide = imageIn.isInteger() ? "(total + halfDivisor)/divisor" : "total/divisor"; out.print("\tpublic static void horizontal( " + imageIn.getSingleBandName() + " input," + imageOut.getSingleBandName() + " output, int offset, int length ) {\n" + "\t\tfinal " + sumType + " divisor = length;\n" + @@ -181,18 +190,26 @@ public void printVerticalBorder() { "\n" + "\t\t// Image Top\n" + "\t\tfor (int count = length - offset; count < length; count++) {\n" + - "\t\t\t{\n" + - "\t\t\t\tint indexIn = input.startIndex + x0;\n" + + "\t\t\tfinal int indexInRow = input.startIndex + x0;\n" + + "\t\t\tif (count == length - offset) {\n" + + "\t\t\t\tint indexIn = indexInRow;\n" + "\n" + "\t\t\t\tfor (int x = x0; x < x1; x++) {\n" + "\t\t\t\t\ttotals[x - x0] = dataSrc[indexIn++]" + bitWise + ";\n" + "\t\t\t\t}\n" + - "\t\t\t}\n" + - "\t\t\tfor (int y = 1; y < count; y++) {\n" + - "\t\t\t\tint indexIn = input.startIndex + x0 + y*input.stride;\n" + "\n" + - "\t\t\t\tfor (int x = x0; x < x1; x++) {\n" + - "\t\t\t\t\ttotals[x - x0] += dataSrc[indexIn++]" + bitWise + ";\n" + + "\t\t\t\tfor (int y = 1; y < count; y++) {\n" + + "\t\t\t\t\tindexIn = indexInRow + y*input.stride;\n" + + "\n" + + "\t\t\t\t\tfor (int x = x0; x < x1; x++) {\n" + + "\t\t\t\t\t\ttotals[x - x0] += dataSrc[indexIn++]" + bitWise + ";\n" + + "\t\t\t\t\t}\n" + + "\t\t\t\t}\n" + + "\t\t\t} else {\n" + + 
"\t\t\t\tint indexIn0 = indexInRow + (count - 1)*input.stride;\n" + + "\t\t\t\tint end = indexIn0 + x1 - x0;\n" + + "\t\t\t\tfor (int i = indexIn0; i < end; i++) {\n" + + "\t\t\t\t\ttotals[i - indexIn0] += dataSrc[i]" + bitWise + ";\n" + "\t\t\t\t}\n" + "\t\t\t}\n" + "\t\t\tint indexOut = output.startIndex + (count - (length - offset))*output.stride;\n" + @@ -202,19 +219,27 @@ public void printVerticalBorder() { "\t\t}\n" + "\t\t// Image Bottom\n" + "\t\tfor (int yStart = height - length + 1; yStart < height - offset; yStart++) {\n" + - "\t\t\t{\n" + - "\t\t\t\tint indexIn = input.startIndex + x0 + yStart*input.stride;\n" + + "\t\t\tfinal int indexInRow = input.startIndex + x0;\n" + + "\t\t\tif (yStart == height - length + 1) {\n" + + "\t\t\t\tint indexIn = indexInRow + yStart*input.stride;\n" + "\n" + "\t\t\t\tfor (int x = x0; x < x1; x++) {\n" + "\t\t\t\t\ttotals[x - x0] = dataSrc[indexIn++]" + bitWise + ";\n" + "\t\t\t\t}\n" + - "\t\t\t}\n" + "\n" + - "\t\t\tfor (int y = yStart + 1; y < height; y++) {\n" + - "\t\t\t\tint indexIn = input.startIndex + x0 + y*input.stride;\n" + + "\t\t\t\tfor (int y = yStart + 1; y < height; y++) {\n" + + "\t\t\t\t\tindexIn = indexInRow + y*input.stride;\n" + "\n" + - "\t\t\t\tfor (int x = x0; x < x1; x++) {\n" + - "\t\t\t\t\ttotals[x - x0] += dataSrc[indexIn++]" + bitWise + ";\n" + + "\t\t\t\t\tfor (int x = x0; x < x1; x++) {\n" + + "\t\t\t\t\t\ttotals[x - x0] += dataSrc[indexIn++]" + bitWise + ";\n" + + "\t\t\t\t\t}\n" + + "\t\t\t\t}\n" + + "\t\t\t} else {\n" + + "\t\t\t\tint indexIn0 = indexInRow + (yStart - 1)*input.stride;\n" + + "\t\t\t\tint indexIn1 = indexIn0 + x1 - x0;\n" + + "\n" + + "\t\t\t\tfor (int i = indexIn0; i < indexIn1; i++) {\n" + + "\t\t\t\t\ttotals[i - indexIn0] -= dataSrc[i]" + bitWise + ";\n" + "\t\t\t\t}\n" + "\t\t\t}\n" + "\n" + @@ -235,7 +260,7 @@ public void printVertical() { String bitWise = imageIn.getBitWise(); String declareHalf = imageIn.isInteger() ? 
"\t\tfinal " + sumType + " halfDivisor = divisor/2;\n" : ""; - String divide = imageIn.isInteger() ? "(total + halfDivisor)/divisor" : "total/divisor"; + String divide = imageIn.isInteger() ? typeCast + "((total + halfDivisor)/divisor)" : "total/divisor"; String workType = ("DogArray_" + imageIn.getKernelType()).replace("S32", "I32"); @@ -243,46 +268,52 @@ public void printVertical() { imageOut.getSingleBandName() + " output, int offset, int length, @Nullable GrowArray<" + workType + "> workspaces ) {\n" + "\t\tworkspaces = BoofMiscOps.checkDeclare(workspaces, " + workType + "::new);\n" + "\t\tfinal " + workType + " work = workspaces.grow(); //CONCURRENT_REMOVE_LINE\n" + - "\t\tfinal int backStep = length*input.stride;\n" + - "\t\tfinal int offsetEnd = length - offset - 1;\n" + "\n" + "\t\tfinal " + sumType + " divisor = length;\n" + declareHalf + - "\n" + - "\t\t// To reduce cache misses it is processed along rows instead of going down columns, which is\n" + - "\t\t// more natural for a vertical convolution. 
For parallel processes this requires building\n" + - "\t\t// a book keeping array for each thread.\n"); + "\t\tfinal int regionStepY = length*input.stride;\n"); String body = ""; - body += "\t\t" + sumType + "[] totals = BoofMiscOps.checkDeclare(work, input.width, false);\n" + - "\t\tfor (int x = 0; x < input.width; x++) {\n" + - "\t\t\tint indexIn = input.startIndex + (y0 - offset)*input.stride + x;\n" + - "\t\t\tint indexOut = output.startIndex + output.stride*y0 + x;\n" + + body += "\t\t" + sumType + "[] totals = BoofMiscOps.checkDeclare(work, x1 - x0, false);\n" + "\n" + - "\t\t\t" + sumType + " total = 0;\n" + - "\t\t\tint indexEnd = indexIn + input.stride*length;\n" + - "\t\t\tfor (; indexIn < indexEnd; indexIn += input.stride) {\n" + - "\t\t\t\ttotal += input.data[indexIn] " + bitWise + ";\n" + + "\t\t// Sum up along x-axis to avoid cache misses when reading from input image\n" + + "\t\t// Initialize recursion by summing up the first kernels along the x-axis\n" + + "\t\t{\n" + + "\t\t\tint indexIn = input.startIndex + x0;\n" + + "\n" + + "\t\t\tfor (int x = x0; x < x1; x++) {\n" + + "\t\t\t\ttotals[x - x0] = input.data[indexIn++]" + bitWise + ";\n" + + "\t\t\t}\n" + + "\t\t}\n" + + "\t\tfor (int y = 1; y < length; y++) {\n" + + "\t\t\tint indexIn = input.startIndex + y*input.stride + x0;\n" + + "\t\t\tint indexInEnd = indexIn + x1 - x0;\n" + + "\t\t\tfor (int i = indexIn; i < indexInEnd; i++) {\n" + + "\t\t\t\ttotals[i - indexIn] += input.data[i]" + bitWise + ";\n" + "\t\t\t}\n" + - "\t\t\ttotals[x] = total;\n" + - "\t\t\toutput.data[indexOut] = " + typeCast + "(" + divide + ");\n" + "\t\t}\n" + "\n" + - "\t\t// change the order it is processed in to reduce cache misses\n" + - "\t\tfor (int y = y0 + 1; y < y1; y++) {\n" + - "\t\t\tint indexIn = input.startIndex + (y + offsetEnd)*input.stride;\n" + - "\t\t\tint indexOut = output.startIndex + y*output.stride;\n" + - "\n" + - "\t\t\tfor (int x = 0; x < input.width; x++, indexIn++, indexOut++) {\n" + - 
"\t\t\t\t" + sumType + " total = totals[x] - (input.data[indexIn - backStep]" + bitWise + ");\n" + - "\t\t\t\ttotals[x] = total += input.data[indexIn]" + bitWise + ";\n" + + "\t\tint indexOut = output.startIndex + output.stride*offset + x0;\n" + + "\t\tfor (int x = x0; x < x1; x++, indexOut++) {\n" + + "\t\t\tfinal " + sumType + " total = totals[x - x0];\n" + + "\t\t\toutput.data[indexOut] = " + divide + ";\n" + + "\t\t}\n" + "\n" + - "\t\t\t\toutput.data[indexOut] = " + typeCast + "(" + divide + ");\n" + + "\t\t// For the reminder we only need to add and remove the first and last elements to update the solution\n" + + "\t\tfor (int y = 0; y < input.height - length; y++) {\n" + + "\t\t\tindexOut = output.startIndex + output.stride*(offset + y + 1) + x0;\n" + + "\t\t\tint indexIn = input.startIndex + y*input.stride + x0;\n" + + "\t\t\tint indexInEnd = indexIn + x1 - x0;\n" + + "\t\t\tfor (int i = indexIn; i < indexInEnd; i++, indexOut++) {\n" + + "\t\t\t\t" + sumType + " total = totals[i - indexIn] - (input.data[i]" + bitWise + ");\n" + + "\t\t\t\ttotal += input.data[i + regionStepY]" + bitWise + ";\n" + + "\t\t\t\toutput.data[indexOut] = " + divide + ";\n" + + "\t\t\t\ttotals[i - indexIn] = total;\n" + "\t\t\t}\n" + "\t\t}\n"; - printParallelBlock("y0", "y1", "offset", "output.height - offsetEnd", "length", body); + printParallelBlock("x0", "x1", "0", "input.width", "20", body); out.print("\t}\n\n"); } diff --git a/main/boofcv-ip/src/main/java/boofcv/alg/filter/convolve/noborder/ImplConvolveMean.java b/main/boofcv-ip/src/main/java/boofcv/alg/filter/convolve/noborder/ImplConvolveMean.java index bcf5a210fc..717221913c 100644 --- a/main/boofcv-ip/src/main/java/boofcv/alg/filter/convolve/noborder/ImplConvolveMean.java +++ b/main/boofcv-ip/src/main/java/boofcv/alg/filter/convolve/noborder/ImplConvolveMean.java @@ -57,25 +57,34 @@ public static void horizontalBorder( GrayU8 input, GrayI8 output, int offset, in int indexDest = output.startIndex + y*output.stride; int j = 
input.startIndex + y*input.stride; - for (int i = 0; i < offset; i++) { - int jEnd = j + i + length - offset; - int total = 0; + int total = 0; + int count = length - offset; + int jEnd = j + count; + if (offset > 0) { for (int indexSrc = j; indexSrc < jEnd; indexSrc++) { total += dataSrc[indexSrc]& 0xFF; } - int count = jEnd - j; dataDst[indexDest++] = (byte)((total + count/2)/count); } - int jEnd = j + width; - j += width - (offset + offsetR); - indexDest += width - (offset + offsetR); - for (int i = 0; i < offsetR; i++) { - int total = 0; - for (int indexSrc = j + i; indexSrc < jEnd; indexSrc++) { + while (++count < length) { + total += dataSrc[jEnd++]& 0xFF; + dataDst[indexDest++] = (byte)((total + count/2)/count); + } + + jEnd = j + width; + count = offset + offsetR; + j += width - count; + indexDest += width - count; + total = 0; + if (offsetR > 0) { + for (int indexSrc = j; indexSrc < jEnd; indexSrc++) { total += dataSrc[indexSrc]& 0xFF; } - int count = jEnd - j - i; + dataDst[indexDest++] = (byte)((total + count/2)/count); + } + while (--count > offset) { + total -= dataSrc[j++]& 0xFF; dataDst[indexDest++] = (byte)((total + count/2)/count); } } @@ -98,14 +107,14 @@ public static void horizontal( GrayU8 input,GrayI8 output, int offset, int lengt for (; indexIn < indexEnd; indexIn++) { total += input.data[indexIn] & 0xFF; } - output.data[indexOut++] = (byte)((total+halfDivisor)/divisor); + output.data[indexOut++] = (byte)((total + halfDivisor)/divisor); indexEnd = indexIn + input.width - length; for (; indexIn < indexEnd; indexIn++) { total -= input.data[indexIn - length] & 0xFF; total += input.data[indexIn] & 0xFF; - output.data[indexOut++] = (byte)((total+halfDivisor)/divisor); + output.data[indexOut++] = (byte)((total + halfDivisor)/divisor); } } //CONCURRENT_ABOVE }); @@ -128,18 +137,26 @@ public static void verticalBorder( GrayU8 input, GrayI8 output, int offset, int // Image Top for (int count = length - offset; count < length; count++) { - { - int 
indexIn = input.startIndex + x0; + final int indexInRow = input.startIndex + x0; + if (count == length - offset) { + int indexIn = indexInRow; for (int x = x0; x < x1; x++) { totals[x - x0] = dataSrc[indexIn++]& 0xFF; } - } - for (int y = 1; y < count; y++) { - int indexIn = input.startIndex + x0 + y*input.stride; - for (int x = x0; x < x1; x++) { - totals[x - x0] += dataSrc[indexIn++]& 0xFF; + for (int y = 1; y < count; y++) { + indexIn = indexInRow + y*input.stride; + + for (int x = x0; x < x1; x++) { + totals[x - x0] += dataSrc[indexIn++]& 0xFF; + } + } + } else { + int indexIn0 = indexInRow + (count - 1)*input.stride; + int end = indexIn0 + x1 - x0; + for (int i = indexIn0; i < end; i++) { + totals[i - indexIn0] += dataSrc[i]& 0xFF; } } int indexOut = output.startIndex + (count - (length - offset))*output.stride; @@ -149,19 +166,27 @@ public static void verticalBorder( GrayU8 input, GrayI8 output, int offset, int } // Image Bottom for (int yStart = height - length + 1; yStart < height - offset; yStart++) { - { - int indexIn = input.startIndex + x0 + yStart*input.stride; + final int indexInRow = input.startIndex + x0; + if (yStart == height - length + 1) { + int indexIn = indexInRow + yStart*input.stride; for (int x = x0; x < x1; x++) { totals[x - x0] = dataSrc[indexIn++]& 0xFF; } - } - for (int y = yStart + 1; y < height; y++) { - int indexIn = input.startIndex + x0 + y*input.stride; + for (int y = yStart + 1; y < height; y++) { + indexIn = indexInRow + y*input.stride; - for (int x = x0; x < x1; x++) { - totals[x - x0] += dataSrc[indexIn++]& 0xFF; + for (int x = x0; x < x1; x++) { + totals[x - x0] += dataSrc[indexIn++]& 0xFF; + } + } + } else { + int indexIn0 = indexInRow + (yStart - 1)*input.stride; + int indexIn1 = indexIn0 + x1 - x0; + + for (int i = indexIn0; i < indexIn1; i++) { + totals[i - indexIn0] -= dataSrc[i]& 0xFF; } } @@ -177,42 +202,48 @@ public static void verticalBorder( GrayU8 input, GrayI8 output, int offset, int public static void vertical( 
GrayU8 input, GrayI8 output, int offset, int length, @Nullable GrowArray workspaces ) { workspaces = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new); final DogArray_I32 work = workspaces.grow(); //CONCURRENT_REMOVE_LINE - final int backStep = length*input.stride; - final int offsetEnd = length - offset - 1; final int divisor = length; final int halfDivisor = divisor/2; + final int regionStepY = length*input.stride; - // To reduce cache misses it is processed along rows instead of going down columns, which is - // more natural for a vertical convolution. For parallel processes this requires building - // a book keeping array for each thread. + //CONCURRENT_BELOW BoofConcurrency.loopBlocks(0, input.width, 20, workspaces, (work, x0, x1)->{ + final int x0 = 0, x1 = input.width; + int[] totals = BoofMiscOps.checkDeclare(work, x1 - x0, false); - //CONCURRENT_BELOW BoofConcurrency.loopBlocks(offset, output.height - offsetEnd, length, workspaces, (work, y0, y1)->{ - final int y0 = offset, y1 = output.height - offsetEnd; - int[] totals = BoofMiscOps.checkDeclare(work, input.width, false); - for (int x = 0; x < input.width; x++) { - int indexIn = input.startIndex + (y0 - offset)*input.stride + x; - int indexOut = output.startIndex + output.stride*y0 + x; + // Sum up along x-axis to avoid cache misses when reading from input image + // Initialize recursion by summing up the first kernels along the x-axis + { + int indexIn = input.startIndex + x0; - int total = 0; - int indexEnd = indexIn + input.stride*length; - for (; indexIn < indexEnd; indexIn += input.stride) { - total += input.data[indexIn] & 0xFF; + for (int x = x0; x < x1; x++) { + totals[x - x0] = input.data[indexIn++]& 0xFF; + } + } + for (int y = 1; y < length; y++) { + int indexIn = input.startIndex + y*input.stride + x0; + int indexInEnd = indexIn + x1 - x0; + for (int i = indexIn; i < indexInEnd; i++) { + totals[i - indexIn] += input.data[i]& 0xFF; } - totals[x] = total; - output.data[indexOut] = 
(byte)((total + halfDivisor)/divisor); } - // change the order it is processed in to reduce cache misses - for (int y = y0 + 1; y < y1; y++) { - int indexIn = input.startIndex + (y + offsetEnd)*input.stride; - int indexOut = output.startIndex + y*output.stride; - - for (int x = 0; x < input.width; x++, indexIn++, indexOut++) { - int total = totals[x] - (input.data[indexIn - backStep]& 0xFF); - totals[x] = total += input.data[indexIn]& 0xFF; + int indexOut = output.startIndex + output.stride*offset + x0; + for (int x = x0; x < x1; x++, indexOut++) { + final int total = totals[x - x0]; + output.data[indexOut] = (byte)((total + halfDivisor)/divisor); + } + // For the reminder we only need to add and remove the first and last elements to update the solution + for (int y = 0; y < input.height - length; y++) { + indexOut = output.startIndex + output.stride*(offset + y + 1) + x0; + int indexIn = input.startIndex + y*input.stride + x0; + int indexInEnd = indexIn + x1 - x0; + for (int i = indexIn; i < indexInEnd; i++, indexOut++) { + int total = totals[i - indexIn] - (input.data[i]& 0xFF); + total += input.data[i + regionStepY]& 0xFF; output.data[indexOut] = (byte)((total + halfDivisor)/divisor); + totals[i - indexIn] = total; } } //CONCURRENT_INLINE }); @@ -232,25 +263,34 @@ public static void horizontalBorder( GrayS16 input, GrayI16 output, int offset, int indexDest = output.startIndex + y*output.stride; int j = input.startIndex + y*input.stride; - for (int i = 0; i < offset; i++) { - int jEnd = j + i + length - offset; - int total = 0; + int total = 0; + int count = length - offset; + int jEnd = j + count; + if (offset > 0) { for (int indexSrc = j; indexSrc < jEnd; indexSrc++) { total += dataSrc[indexSrc]; } - int count = jEnd - j; dataDst[indexDest++] = (short)((total + count/2)/count); } - int jEnd = j + width; - j += width - (offset + offsetR); - indexDest += width - (offset + offsetR); - for (int i = 0; i < offsetR; i++) { - int total = 0; - for (int indexSrc = j + 
i; indexSrc < jEnd; indexSrc++) { + while (++count < length) { + total += dataSrc[jEnd++]; + dataDst[indexDest++] = (short)((total + count/2)/count); + } + + jEnd = j + width; + count = offset + offsetR; + j += width - count; + indexDest += width - count; + total = 0; + if (offsetR > 0) { + for (int indexSrc = j; indexSrc < jEnd; indexSrc++) { total += dataSrc[indexSrc]; } - int count = jEnd - j - i; + dataDst[indexDest++] = (short)((total + count/2)/count); + } + while (--count > offset) { + total -= dataSrc[j++]; dataDst[indexDest++] = (short)((total + count/2)/count); } } @@ -273,14 +313,14 @@ public static void horizontal( GrayS16 input,GrayI16 output, int offset, int len for (; indexIn < indexEnd; indexIn++) { total += input.data[indexIn] ; } - output.data[indexOut++] = (short)((total+halfDivisor)/divisor); + output.data[indexOut++] = (short)((total + halfDivisor)/divisor); indexEnd = indexIn + input.width - length; for (; indexIn < indexEnd; indexIn++) { total -= input.data[indexIn - length] ; total += input.data[indexIn] ; - output.data[indexOut++] = (short)((total+halfDivisor)/divisor); + output.data[indexOut++] = (short)((total + halfDivisor)/divisor); } } //CONCURRENT_ABOVE }); @@ -303,18 +343,26 @@ public static void verticalBorder( GrayS16 input, GrayI16 output, int offset, in // Image Top for (int count = length - offset; count < length; count++) { - { - int indexIn = input.startIndex + x0; + final int indexInRow = input.startIndex + x0; + if (count == length - offset) { + int indexIn = indexInRow; for (int x = x0; x < x1; x++) { totals[x - x0] = dataSrc[indexIn++]; } - } - for (int y = 1; y < count; y++) { - int indexIn = input.startIndex + x0 + y*input.stride; - for (int x = x0; x < x1; x++) { - totals[x - x0] += dataSrc[indexIn++]; + for (int y = 1; y < count; y++) { + indexIn = indexInRow + y*input.stride; + + for (int x = x0; x < x1; x++) { + totals[x - x0] += dataSrc[indexIn++]; + } + } + } else { + int indexIn0 = indexInRow + (count - 
1)*input.stride; + int end = indexIn0 + x1 - x0; + for (int i = indexIn0; i < end; i++) { + totals[i - indexIn0] += dataSrc[i]; } } int indexOut = output.startIndex + (count - (length - offset))*output.stride; @@ -324,19 +372,27 @@ public static void verticalBorder( GrayS16 input, GrayI16 output, int offset, in } // Image Bottom for (int yStart = height - length + 1; yStart < height - offset; yStart++) { - { - int indexIn = input.startIndex + x0 + yStart*input.stride; + final int indexInRow = input.startIndex + x0; + if (yStart == height - length + 1) { + int indexIn = indexInRow + yStart*input.stride; for (int x = x0; x < x1; x++) { totals[x - x0] = dataSrc[indexIn++]; } - } - for (int y = yStart + 1; y < height; y++) { - int indexIn = input.startIndex + x0 + y*input.stride; + for (int y = yStart + 1; y < height; y++) { + indexIn = indexInRow + y*input.stride; - for (int x = x0; x < x1; x++) { - totals[x - x0] += dataSrc[indexIn++]; + for (int x = x0; x < x1; x++) { + totals[x - x0] += dataSrc[indexIn++]; + } + } + } else { + int indexIn0 = indexInRow + (yStart - 1)*input.stride; + int indexIn1 = indexIn0 + x1 - x0; + + for (int i = indexIn0; i < indexIn1; i++) { + totals[i - indexIn0] -= dataSrc[i]; } } @@ -352,42 +408,48 @@ public static void verticalBorder( GrayS16 input, GrayI16 output, int offset, in public static void vertical( GrayS16 input, GrayI16 output, int offset, int length, @Nullable GrowArray workspaces ) { workspaces = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new); final DogArray_I32 work = workspaces.grow(); //CONCURRENT_REMOVE_LINE - final int backStep = length*input.stride; - final int offsetEnd = length - offset - 1; final int divisor = length; final int halfDivisor = divisor/2; + final int regionStepY = length*input.stride; - // To reduce cache misses it is processed along rows instead of going down columns, which is - // more natural for a vertical convolution. 
For parallel processes this requires building - // a book keeping array for each thread. + //CONCURRENT_BELOW BoofConcurrency.loopBlocks(0, input.width, 20, workspaces, (work, x0, x1)->{ + final int x0 = 0, x1 = input.width; + int[] totals = BoofMiscOps.checkDeclare(work, x1 - x0, false); - //CONCURRENT_BELOW BoofConcurrency.loopBlocks(offset, output.height - offsetEnd, length, workspaces, (work, y0, y1)->{ - final int y0 = offset, y1 = output.height - offsetEnd; - int[] totals = BoofMiscOps.checkDeclare(work, input.width, false); - for (int x = 0; x < input.width; x++) { - int indexIn = input.startIndex + (y0 - offset)*input.stride + x; - int indexOut = output.startIndex + output.stride*y0 + x; + // Sum up along x-axis to avoid cache misses when reading from input image + // Initialize recursion by summing up the first kernels along the x-axis + { + int indexIn = input.startIndex + x0; - int total = 0; - int indexEnd = indexIn + input.stride*length; - for (; indexIn < indexEnd; indexIn += input.stride) { - total += input.data[indexIn] ; + for (int x = x0; x < x1; x++) { + totals[x - x0] = input.data[indexIn++]; + } + } + for (int y = 1; y < length; y++) { + int indexIn = input.startIndex + y*input.stride + x0; + int indexInEnd = indexIn + x1 - x0; + for (int i = indexIn; i < indexInEnd; i++) { + totals[i - indexIn] += input.data[i]; } - totals[x] = total; - output.data[indexOut] = (short)((total + halfDivisor)/divisor); } - // change the order it is processed in to reduce cache misses - for (int y = y0 + 1; y < y1; y++) { - int indexIn = input.startIndex + (y + offsetEnd)*input.stride; - int indexOut = output.startIndex + y*output.stride; - - for (int x = 0; x < input.width; x++, indexIn++, indexOut++) { - int total = totals[x] - (input.data[indexIn - backStep]); - totals[x] = total += input.data[indexIn]; + int indexOut = output.startIndex + output.stride*offset + x0; + for (int x = x0; x < x1; x++, indexOut++) { + final int total = totals[x - x0]; + 
output.data[indexOut] = (short)((total + halfDivisor)/divisor); + } + // For the reminder we only need to add and remove the first and last elements to update the solution + for (int y = 0; y < input.height - length; y++) { + indexOut = output.startIndex + output.stride*(offset + y + 1) + x0; + int indexIn = input.startIndex + y*input.stride + x0; + int indexInEnd = indexIn + x1 - x0; + for (int i = indexIn; i < indexInEnd; i++, indexOut++) { + int total = totals[i - indexIn] - (input.data[i]); + total += input.data[i + regionStepY]; output.data[indexOut] = (short)((total + halfDivisor)/divisor); + totals[i - indexIn] = total; } } //CONCURRENT_INLINE }); @@ -407,25 +469,34 @@ public static void horizontalBorder( GrayU16 input, GrayI16 output, int offset, int indexDest = output.startIndex + y*output.stride; int j = input.startIndex + y*input.stride; - for (int i = 0; i < offset; i++) { - int jEnd = j + i + length - offset; - int total = 0; + int total = 0; + int count = length - offset; + int jEnd = j + count; + if (offset > 0) { for (int indexSrc = j; indexSrc < jEnd; indexSrc++) { total += dataSrc[indexSrc]& 0xFFFF; } - int count = jEnd - j; dataDst[indexDest++] = (short)((total + count/2)/count); } - int jEnd = j + width; - j += width - (offset + offsetR); - indexDest += width - (offset + offsetR); - for (int i = 0; i < offsetR; i++) { - int total = 0; - for (int indexSrc = j + i; indexSrc < jEnd; indexSrc++) { + while (++count < length) { + total += dataSrc[jEnd++]& 0xFFFF; + dataDst[indexDest++] = (short)((total + count/2)/count); + } + + jEnd = j + width; + count = offset + offsetR; + j += width - count; + indexDest += width - count; + total = 0; + if (offsetR > 0) { + for (int indexSrc = j; indexSrc < jEnd; indexSrc++) { total += dataSrc[indexSrc]& 0xFFFF; } - int count = jEnd - j - i; + dataDst[indexDest++] = (short)((total + count/2)/count); + } + while (--count > offset) { + total -= dataSrc[j++]& 0xFFFF; dataDst[indexDest++] = (short)((total + 
count/2)/count); } } @@ -448,14 +519,14 @@ public static void horizontal( GrayU16 input,GrayI16 output, int offset, int len for (; indexIn < indexEnd; indexIn++) { total += input.data[indexIn] & 0xFFFF; } - output.data[indexOut++] = (short)((total+halfDivisor)/divisor); + output.data[indexOut++] = (short)((total + halfDivisor)/divisor); indexEnd = indexIn + input.width - length; for (; indexIn < indexEnd; indexIn++) { total -= input.data[indexIn - length] & 0xFFFF; total += input.data[indexIn] & 0xFFFF; - output.data[indexOut++] = (short)((total+halfDivisor)/divisor); + output.data[indexOut++] = (short)((total + halfDivisor)/divisor); } } //CONCURRENT_ABOVE }); @@ -478,18 +549,26 @@ public static void verticalBorder( GrayU16 input, GrayI16 output, int offset, in // Image Top for (int count = length - offset; count < length; count++) { - { - int indexIn = input.startIndex + x0; + final int indexInRow = input.startIndex + x0; + if (count == length - offset) { + int indexIn = indexInRow; for (int x = x0; x < x1; x++) { totals[x - x0] = dataSrc[indexIn++]& 0xFFFF; } - } - for (int y = 1; y < count; y++) { - int indexIn = input.startIndex + x0 + y*input.stride; - for (int x = x0; x < x1; x++) { - totals[x - x0] += dataSrc[indexIn++]& 0xFFFF; + for (int y = 1; y < count; y++) { + indexIn = indexInRow + y*input.stride; + + for (int x = x0; x < x1; x++) { + totals[x - x0] += dataSrc[indexIn++]& 0xFFFF; + } + } + } else { + int indexIn0 = indexInRow + (count - 1)*input.stride; + int end = indexIn0 + x1 - x0; + for (int i = indexIn0; i < end; i++) { + totals[i - indexIn0] += dataSrc[i]& 0xFFFF; } } int indexOut = output.startIndex + (count - (length - offset))*output.stride; @@ -499,19 +578,27 @@ public static void verticalBorder( GrayU16 input, GrayI16 output, int offset, in } // Image Bottom for (int yStart = height - length + 1; yStart < height - offset; yStart++) { - { - int indexIn = input.startIndex + x0 + yStart*input.stride; + final int indexInRow = input.startIndex 
+ x0; + if (yStart == height - length + 1) { + int indexIn = indexInRow + yStart*input.stride; for (int x = x0; x < x1; x++) { totals[x - x0] = dataSrc[indexIn++]& 0xFFFF; } - } - for (int y = yStart + 1; y < height; y++) { - int indexIn = input.startIndex + x0 + y*input.stride; + for (int y = yStart + 1; y < height; y++) { + indexIn = indexInRow + y*input.stride; - for (int x = x0; x < x1; x++) { - totals[x - x0] += dataSrc[indexIn++]& 0xFFFF; + for (int x = x0; x < x1; x++) { + totals[x - x0] += dataSrc[indexIn++]& 0xFFFF; + } + } + } else { + int indexIn0 = indexInRow + (yStart - 1)*input.stride; + int indexIn1 = indexIn0 + x1 - x0; + + for (int i = indexIn0; i < indexIn1; i++) { + totals[i - indexIn0] -= dataSrc[i]& 0xFFFF; } } @@ -527,42 +614,48 @@ public static void verticalBorder( GrayU16 input, GrayI16 output, int offset, in public static void vertical( GrayU16 input, GrayI16 output, int offset, int length, @Nullable GrowArray workspaces ) { workspaces = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new); final DogArray_I32 work = workspaces.grow(); //CONCURRENT_REMOVE_LINE - final int backStep = length*input.stride; - final int offsetEnd = length - offset - 1; final int divisor = length; final int halfDivisor = divisor/2; + final int regionStepY = length*input.stride; - // To reduce cache misses it is processed along rows instead of going down columns, which is - // more natural for a vertical convolution. For parallel processes this requires building - // a book keeping array for each thread. 
+ //CONCURRENT_BELOW BoofConcurrency.loopBlocks(0, input.width, 20, workspaces, (work, x0, x1)->{ + final int x0 = 0, x1 = input.width; + int[] totals = BoofMiscOps.checkDeclare(work, x1 - x0, false); - //CONCURRENT_BELOW BoofConcurrency.loopBlocks(offset, output.height - offsetEnd, length, workspaces, (work, y0, y1)->{ - final int y0 = offset, y1 = output.height - offsetEnd; - int[] totals = BoofMiscOps.checkDeclare(work, input.width, false); - for (int x = 0; x < input.width; x++) { - int indexIn = input.startIndex + (y0 - offset)*input.stride + x; - int indexOut = output.startIndex + output.stride*y0 + x; + // Sum up along x-axis to avoid cache misses when reading from input image + // Initialize recursion by summing up the first kernels along the x-axis + { + int indexIn = input.startIndex + x0; - int total = 0; - int indexEnd = indexIn + input.stride*length; - for (; indexIn < indexEnd; indexIn += input.stride) { - total += input.data[indexIn] & 0xFFFF; + for (int x = x0; x < x1; x++) { + totals[x - x0] = input.data[indexIn++]& 0xFFFF; + } + } + for (int y = 1; y < length; y++) { + int indexIn = input.startIndex + y*input.stride + x0; + int indexInEnd = indexIn + x1 - x0; + for (int i = indexIn; i < indexInEnd; i++) { + totals[i - indexIn] += input.data[i]& 0xFFFF; } - totals[x] = total; - output.data[indexOut] = (short)((total + halfDivisor)/divisor); } - // change the order it is processed in to reduce cache misses - for (int y = y0 + 1; y < y1; y++) { - int indexIn = input.startIndex + (y + offsetEnd)*input.stride; - int indexOut = output.startIndex + y*output.stride; - - for (int x = 0; x < input.width; x++, indexIn++, indexOut++) { - int total = totals[x] - (input.data[indexIn - backStep]& 0xFFFF); - totals[x] = total += input.data[indexIn]& 0xFFFF; + int indexOut = output.startIndex + output.stride*offset + x0; + for (int x = x0; x < x1; x++, indexOut++) { + final int total = totals[x - x0]; + output.data[indexOut] = (short)((total + 
halfDivisor)/divisor); + } + // For the remainder we only need to add and remove the first and last elements to update the solution + for (int y = 0; y < input.height - length; y++) { + indexOut = output.startIndex + output.stride*(offset + y + 1) + x0; + int indexIn = input.startIndex + y*input.stride + x0; + int indexInEnd = indexIn + x1 - x0; + for (int i = indexIn; i < indexInEnd; i++, indexOut++) { + int total = totals[i - indexIn] - (input.data[i]& 0xFFFF); + total += input.data[i + regionStepY]& 0xFFFF; output.data[indexOut] = (short)((total + halfDivisor)/divisor); + totals[i - indexIn] = total; } } //CONCURRENT_INLINE }); @@ -582,25 +675,34 @@ public static void horizontalBorder( GrayF32 input, GrayF32 output, int offset, int indexDest = output.startIndex + y*output.stride; int j = input.startIndex + y*input.stride; - for (int i = 0; i < offset; i++) { - int jEnd = j + i + length - offset; - float total = 0; + float total = 0; + int count = length - offset; + int jEnd = j + count; + if (offset > 0) { for (int indexSrc = j; indexSrc < jEnd; indexSrc++) { total += dataSrc[indexSrc]; } - int count = jEnd - j; dataDst[indexDest++] = total/count; } - int jEnd = j + width; - j += width - (offset + offsetR); - indexDest += width - (offset + offsetR); - for (int i = 0; i < offsetR; i++) { - float total = 0; - for (int indexSrc = j + i; indexSrc < jEnd; indexSrc++) { + while (++count < length) { + total += dataSrc[jEnd++]; + dataDst[indexDest++] = total/count; + } + + jEnd = j + width; + count = offset + offsetR; + j += width - count; + indexDest += width - count; + total = 0; + if (offsetR > 0) { + for (int indexSrc = j; indexSrc < jEnd; indexSrc++) { total += dataSrc[indexSrc]; } - int count = jEnd - j - i; + dataDst[indexDest++] = total/count; + } + while (--count > offset) { + total -= dataSrc[j++]; dataDst[indexDest++] = total/count; } } @@ -652,18 +754,26 @@ public static void verticalBorder( GrayF32 input, GrayF32 output, int offset, in // Image Top for (int
count = length - offset; count < length; count++) { - { - int indexIn = input.startIndex + x0; + final int indexInRow = input.startIndex + x0; + if (count == length - offset) { + int indexIn = indexInRow; for (int x = x0; x < x1; x++) { totals[x - x0] = dataSrc[indexIn++]; } - } - for (int y = 1; y < count; y++) { - int indexIn = input.startIndex + x0 + y*input.stride; - for (int x = x0; x < x1; x++) { - totals[x - x0] += dataSrc[indexIn++]; + for (int y = 1; y < count; y++) { + indexIn = indexInRow + y*input.stride; + + for (int x = x0; x < x1; x++) { + totals[x - x0] += dataSrc[indexIn++]; + } + } + } else { + int indexIn0 = indexInRow + (count - 1)*input.stride; + int end = indexIn0 + x1 - x0; + for (int i = indexIn0; i < end; i++) { + totals[i - indexIn0] += dataSrc[i]; } } int indexOut = output.startIndex + (count - (length - offset))*output.stride; @@ -673,19 +783,27 @@ public static void verticalBorder( GrayF32 input, GrayF32 output, int offset, in } // Image Bottom for (int yStart = height - length + 1; yStart < height - offset; yStart++) { - { - int indexIn = input.startIndex + x0 + yStart*input.stride; + final int indexInRow = input.startIndex + x0; + if (yStart == height - length + 1) { + int indexIn = indexInRow + yStart*input.stride; for (int x = x0; x < x1; x++) { totals[x - x0] = dataSrc[indexIn++]; } - } - for (int y = yStart + 1; y < height; y++) { - int indexIn = input.startIndex + x0 + y*input.stride; + for (int y = yStart + 1; y < height; y++) { + indexIn = indexInRow + y*input.stride; - for (int x = x0; x < x1; x++) { - totals[x - x0] += dataSrc[indexIn++]; + for (int x = x0; x < x1; x++) { + totals[x - x0] += dataSrc[indexIn++]; + } + } + } else { + int indexIn0 = indexInRow + (yStart - 1)*input.stride; + int indexIn1 = indexIn0 + x1 - x0; + + for (int i = indexIn0; i < indexIn1; i++) { + totals[i - indexIn0] -= dataSrc[i]; } } @@ -701,41 +819,47 @@ public static void verticalBorder( GrayF32 input, GrayF32 output, int offset, in public static 
void vertical( GrayF32 input, GrayF32 output, int offset, int length, @Nullable GrowArray workspaces ) { workspaces = BoofMiscOps.checkDeclare(workspaces, DogArray_F32::new); final DogArray_F32 work = workspaces.grow(); //CONCURRENT_REMOVE_LINE - final int backStep = length*input.stride; - final int offsetEnd = length - offset - 1; final float divisor = length; + final int regionStepY = length*input.stride; - // To reduce cache misses it is processed along rows instead of going down columns, which is - // more natural for a vertical convolution. For parallel processes this requires building - // a book keeping array for each thread. + //CONCURRENT_BELOW BoofConcurrency.loopBlocks(0, input.width, 20, workspaces, (work, x0, x1)->{ + final int x0 = 0, x1 = input.width; + float[] totals = BoofMiscOps.checkDeclare(work, x1 - x0, false); - //CONCURRENT_BELOW BoofConcurrency.loopBlocks(offset, output.height - offsetEnd, length, workspaces, (work, y0, y1)->{ - final int y0 = offset, y1 = output.height - offsetEnd; - float[] totals = BoofMiscOps.checkDeclare(work, input.width, false); - for (int x = 0; x < input.width; x++) { - int indexIn = input.startIndex + (y0 - offset)*input.stride + x; - int indexOut = output.startIndex + output.stride*y0 + x; + // Sum up along x-axis to avoid cache misses when reading from input image + // Initialize recursion by summing up the first kernels along the x-axis + { + int indexIn = input.startIndex + x0; - float total = 0; - int indexEnd = indexIn + input.stride*length; - for (; indexIn < indexEnd; indexIn += input.stride) { - total += input.data[indexIn] ; + for (int x = x0; x < x1; x++) { + totals[x - x0] = input.data[indexIn++]; + } + } + for (int y = 1; y < length; y++) { + int indexIn = input.startIndex + y*input.stride + x0; + int indexInEnd = indexIn + x1 - x0; + for (int i = indexIn; i < indexInEnd; i++) { + totals[i - indexIn] += input.data[i]; } - totals[x] = total; - output.data[indexOut] = (total/divisor); } - // change the 
order it is processed in to reduce cache misses - for (int y = y0 + 1; y < y1; y++) { - int indexIn = input.startIndex + (y + offsetEnd)*input.stride; - int indexOut = output.startIndex + y*output.stride; - - for (int x = 0; x < input.width; x++, indexIn++, indexOut++) { - float total = totals[x] - (input.data[indexIn - backStep]); - totals[x] = total += input.data[indexIn]; + int indexOut = output.startIndex + output.stride*offset + x0; + for (int x = x0; x < x1; x++, indexOut++) { + final float total = totals[x - x0]; + output.data[indexOut] = total/divisor; + } - output.data[indexOut] = (total/divisor); + // For the remainder we only need to add and remove the first and last elements to update the solution + for (int y = 0; y < input.height - length; y++) { + indexOut = output.startIndex + output.stride*(offset + y + 1) + x0; + int indexIn = input.startIndex + y*input.stride + x0; + int indexInEnd = indexIn + x1 - x0; + for (int i = indexIn; i < indexInEnd; i++, indexOut++) { + float total = totals[i - indexIn] - (input.data[i]); + total += input.data[i + regionStepY]; + output.data[indexOut] = total/divisor; + totals[i - indexIn] = total; } } //CONCURRENT_INLINE }); @@ -755,25 +879,34 @@ public static void horizontalBorder( GrayF64 input, GrayF64 output, int offset, int indexDest = output.startIndex + y*output.stride; int j = input.startIndex + y*input.stride; - for (int i = 0; i < offset; i++) { - int jEnd = j + i + length - offset; - double total = 0; + double total = 0; + int count = length - offset; + int jEnd = j + count; + if (offset > 0) { for (int indexSrc = j; indexSrc < jEnd; indexSrc++) { total += dataSrc[indexSrc]; } - int count = jEnd - j; dataDst[indexDest++] = total/count; } - int jEnd = j + width; - j += width - (offset + offsetR); - indexDest += width - (offset + offsetR); - for (int i = 0; i < offsetR; i++) { - double total = 0; - for (int indexSrc = j + i; indexSrc < jEnd; indexSrc++) { + while (++count < length) { + total += dataSrc[jEnd++];
+ dataDst[indexDest++] = total/count; + } + + jEnd = j + width; + count = offset + offsetR; + j += width - count; + indexDest += width - count; + total = 0; + if (offsetR > 0) { + for (int indexSrc = j; indexSrc < jEnd; indexSrc++) { total += dataSrc[indexSrc]; } - int count = jEnd - j - i; + dataDst[indexDest++] = total/count; + } + while (--count > offset) { + total -= dataSrc[j++]; dataDst[indexDest++] = total/count; } } @@ -825,18 +958,26 @@ public static void verticalBorder( GrayF64 input, GrayF64 output, int offset, in // Image Top for (int count = length - offset; count < length; count++) { - { - int indexIn = input.startIndex + x0; + final int indexInRow = input.startIndex + x0; + if (count == length - offset) { + int indexIn = indexInRow; for (int x = x0; x < x1; x++) { totals[x - x0] = dataSrc[indexIn++]; } - } - for (int y = 1; y < count; y++) { - int indexIn = input.startIndex + x0 + y*input.stride; - for (int x = x0; x < x1; x++) { - totals[x - x0] += dataSrc[indexIn++]; + for (int y = 1; y < count; y++) { + indexIn = indexInRow + y*input.stride; + + for (int x = x0; x < x1; x++) { + totals[x - x0] += dataSrc[indexIn++]; + } + } + } else { + int indexIn0 = indexInRow + (count - 1)*input.stride; + int end = indexIn0 + x1 - x0; + for (int i = indexIn0; i < end; i++) { + totals[i - indexIn0] += dataSrc[i]; } } int indexOut = output.startIndex + (count - (length - offset))*output.stride; @@ -846,19 +987,27 @@ public static void verticalBorder( GrayF64 input, GrayF64 output, int offset, in } // Image Bottom for (int yStart = height - length + 1; yStart < height - offset; yStart++) { - { - int indexIn = input.startIndex + x0 + yStart*input.stride; + final int indexInRow = input.startIndex + x0; + if (yStart == height - length + 1) { + int indexIn = indexInRow + yStart*input.stride; for (int x = x0; x < x1; x++) { totals[x - x0] = dataSrc[indexIn++]; } - } - for (int y = yStart + 1; y < height; y++) { - int indexIn = input.startIndex + x0 + y*input.stride; + 
for (int y = yStart + 1; y < height; y++) { + indexIn = indexInRow + y*input.stride; - for (int x = x0; x < x1; x++) { - totals[x - x0] += dataSrc[indexIn++]; + for (int x = x0; x < x1; x++) { + totals[x - x0] += dataSrc[indexIn++]; + } + } + } else { + int indexIn0 = indexInRow + (yStart - 1)*input.stride; + int indexIn1 = indexIn0 + x1 - x0; + + for (int i = indexIn0; i < indexIn1; i++) { + totals[i - indexIn0] -= dataSrc[i]; } } @@ -874,41 +1023,47 @@ public static void verticalBorder( GrayF64 input, GrayF64 output, int offset, in public static void vertical( GrayF64 input, GrayF64 output, int offset, int length, @Nullable GrowArray workspaces ) { workspaces = BoofMiscOps.checkDeclare(workspaces, DogArray_F64::new); final DogArray_F64 work = workspaces.grow(); //CONCURRENT_REMOVE_LINE - final int backStep = length*input.stride; - final int offsetEnd = length - offset - 1; final double divisor = length; + final int regionStepY = length*input.stride; - // To reduce cache misses it is processed along rows instead of going down columns, which is - // more natural for a vertical convolution. For parallel processes this requires building - // a book keeping array for each thread. 
+ //CONCURRENT_BELOW BoofConcurrency.loopBlocks(0, input.width, 20, workspaces, (work, x0, x1)->{ + final int x0 = 0, x1 = input.width; + double[] totals = BoofMiscOps.checkDeclare(work, x1 - x0, false); - //CONCURRENT_BELOW BoofConcurrency.loopBlocks(offset, output.height - offsetEnd, length, workspaces, (work, y0, y1)->{ - final int y0 = offset, y1 = output.height - offsetEnd; - double[] totals = BoofMiscOps.checkDeclare(work, input.width, false); - for (int x = 0; x < input.width; x++) { - int indexIn = input.startIndex + (y0 - offset)*input.stride + x; - int indexOut = output.startIndex + output.stride*y0 + x; + // Sum up along x-axis to avoid cache misses when reading from input image + // Initialize recursion by summing up the first kernels along the x-axis + { + int indexIn = input.startIndex + x0; - double total = 0; - int indexEnd = indexIn + input.stride*length; - for (; indexIn < indexEnd; indexIn += input.stride) { - total += input.data[indexIn] ; + for (int x = x0; x < x1; x++) { + totals[x - x0] = input.data[indexIn++]; + } + } + for (int y = 1; y < length; y++) { + int indexIn = input.startIndex + y*input.stride + x0; + int indexInEnd = indexIn + x1 - x0; + for (int i = indexIn; i < indexInEnd; i++) { + totals[i - indexIn] += input.data[i]; } - totals[x] = total; - output.data[indexOut] = (total/divisor); } - // change the order it is processed in to reduce cache misses - for (int y = y0 + 1; y < y1; y++) { - int indexIn = input.startIndex + (y + offsetEnd)*input.stride; - int indexOut = output.startIndex + y*output.stride; - - for (int x = 0; x < input.width; x++, indexIn++, indexOut++) { - double total = totals[x] - (input.data[indexIn - backStep]); - totals[x] = total += input.data[indexIn]; + int indexOut = output.startIndex + output.stride*offset + x0; + for (int x = x0; x < x1; x++, indexOut++) { + final double total = totals[x - x0]; + output.data[indexOut] = total/divisor; + } - output.data[indexOut] = (total/divisor); + // For the remainder
we only need to add and remove the first and last elements to update the solution + for (int y = 0; y < input.height - length; y++) { + indexOut = output.startIndex + output.stride*(offset + y + 1) + x0; + int indexIn = input.startIndex + y*input.stride + x0; + int indexInEnd = indexIn + x1 - x0; + for (int i = indexIn; i < indexInEnd; i++, indexOut++) { + double total = totals[i - indexIn] - (input.data[i]); + total += input.data[i + regionStepY]; + output.data[indexOut] = total/divisor; + totals[i - indexIn] = total; } } //CONCURRENT_INLINE }); diff --git a/main/boofcv-ip/src/test/java/boofcv/alg/filter/convolve/TestConvolveImageMean.java b/main/boofcv-ip/src/test/java/boofcv/alg/filter/convolve/TestConvolveImageMean.java index eab7d732ec..a1e123c7b7 100644 --- a/main/boofcv-ip/src/test/java/boofcv/alg/filter/convolve/TestConvolveImageMean.java +++ b/main/boofcv-ip/src/test/java/boofcv/alg/filter/convolve/TestConvolveImageMean.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Peter Abeles. All Rights Reserved. + * Copyright (c) 2025, Peter Abeles. All Rights Reserved. * * This file is part of BoofCV (http://boofcv.org). 
* @@ -53,34 +53,34 @@ public TestConvolveImageMean() { } @Override - protected boolean isTestMethod(Method m) { + protected boolean isTestMethod( Method m ) { Class params[] = m.getParameterTypes(); - if( params.length < 4 || params.length > 6) + if (params.length < 4 || params.length > 6) return false; return ImageGray.class.isAssignableFrom(params[0]); } @Override - protected boolean isEquivalent(Method validation, Method target) { + protected boolean isEquivalent( Method validation, Method target ) { Class[] v = validation.getParameterTypes(); Class[] c = target.getParameterTypes(); - if( !target.getName().equals(validation.getName())) + if (!target.getName().equals(validation.getName())) return false; - if( c[0] != v[1] || c[1] != v[2]) + if (c[0] != v[1] || c[1] != v[2]) return false; - if( target.getName().equals("vertical")) { - if( ImageBorder.class.isAssignableFrom(c[4]) ) { + if (target.getName().equals("vertical")) { + if (ImageBorder.class.isAssignableFrom(c[4])) { return v.length >= 4 && ImageBorder.class.isAssignableFrom(v[3]); - } else if( v.length != 3 ){ + } else if (v.length != 3) { return false; } - } else if( (c.length == 4) ^ (v.length == 3)) { + } else if ((c.length == 4) ^ (v.length == 3)) { return false; } @@ -88,7 +88,7 @@ protected boolean isEquivalent(Method validation, Method target) { } @Override - protected Object[][] createInputParam(Method candidate, Method validation) { + protected Object[][] createInputParam( Method candidate, Method validation ) { Class[] c = candidate.getParameterTypes(); @@ -97,16 +97,16 @@ protected Object[][] createInputParam(Method candidate, Method validation) { GImageMiscOps.fillUniform(input, rand, 0, 100); - ImageBorder border = FactoryImageBorder.generic(BorderType.REFLECT,input.getImageType()); + ImageBorder border = FactoryImageBorder.generic(BorderType.REFLECT, input.getImageType()); Object[][] ret = new Object[2][]; - if( c.length == 4 ) { + if (c.length == 4) { ret[0] = new Object[]{input, output, 
offset1, length1}; ret[1] = new Object[]{input, output, offset2, length2}; - } else if( c.length == 5 ) { + } else if (c.length == 5) { ret[0] = new Object[]{input, output, offset1, length1, null}; ret[1] = new Object[]{input, output, offset2, length2, null}; - if( ImageBorder.class.isAssignableFrom(c[4]) ) { + if (ImageBorder.class.isAssignableFrom(c[4])) { ret[0][4] = border; ret[1][4] = border; } @@ -119,13 +119,13 @@ protected Object[][] createInputParam(Method candidate, Method validation) { } @Override - protected Object[] reformatForValidation(Method m, Object[] targetParam) { + protected Object[] reformatForValidation( Method m, Object[] targetParam ) { Class[] params = m.getParameterTypes(); - Object kernel = createTableKernel(params[0],(Integer)targetParam[2],(Integer)targetParam[3]); + Object kernel = createTableKernel(params[0], (Integer)targetParam[2], (Integer)targetParam[3]); ImageGray output = (ImageGray)((ImageGray)targetParam[1]).clone(); - if( ImageBorder.class.isAssignableFrom(params[params.length-1])) { + if (ImageBorder.class.isAssignableFrom(params[params.length - 1])) { return new Object[]{kernel, targetParam[0], output, targetParam[4]}; } else { return new Object[]{kernel, targetParam[0], output}; @@ -133,18 +133,20 @@ protected Object[] reformatForValidation(Method m, Object[] targetParam) { } @Override - protected void compareResults(Object targetResult, Object[] targetParam, Object validationResult, Object[] validationParam) { - - if (validationParam.length == 3) { - ImageGray expected = (ImageGray) validationParam[2]; - ImageGray found = (ImageGray) targetParam[1]; - - BoofTesting.assertEquals(expected, found, 1e-4); - } else { - ImageGray expected = (ImageGray) validationParam[2]; - ImageGray found = (ImageGray) targetParam[1]; - - BoofTesting.assertEquals(expected, found, 1e-4); + protected void compareResults( Object targetResult, Object[] targetParam, Object validationResult, Object[] validationParam ) { + ImageGray expected = 
(ImageGray)validationParam[2]; + ImageGray found = (ImageGray)targetParam[1]; + try { + if (validationParam.length == 3) { + BoofTesting.assertEquals(expected, found, 1e-4); + } else { + BoofTesting.assertEquals(expected, found, 1e-4); + } + } catch (Exception e) { + expected.print(); + System.out.println("----"); + found.print(); + throw e; } } } diff --git a/main/boofcv-ip/src/test/java/boofcv/alg/filter/convolve/noborder/TestImplConvolveMean.java b/main/boofcv-ip/src/test/java/boofcv/alg/filter/convolve/noborder/TestImplConvolveMean.java index 7a0973b55d..c6eb591f52 100644 --- a/main/boofcv-ip/src/test/java/boofcv/alg/filter/convolve/noborder/TestImplConvolveMean.java +++ b/main/boofcv-ip/src/test/java/boofcv/alg/filter/convolve/noborder/TestImplConvolveMean.java @@ -93,18 +93,27 @@ protected Object[][] createInputParam( Method candidate, Method validation ) { ImageGray input = GeneralizedImageOps.createSingleBand(candidateParam[0], width, height); ImageGray output = GeneralizedImageOps.createSingleBand(candidateParam[1], width, height); - GImageMiscOps.fillUniform(input, rand, 0, 50); + // Make sure the bitwise operators do something. 
If the value is too small it won't change the value + double maxValue = 100.0; + if (input.getDataType().isInteger()) { + maxValue = input.getDataType().getMaxValue(); + } + GImageMiscOps.fillUniform(input, rand, 0, maxValue); - Object[][] ret = new Object[3][]; + Object[][] ret = new Object[5][]; if (candidateParam.length == 4) { ret[0] = new Object[]{input, output, kernelOffset, kernelLength}; ret[1] = new Object[]{input, output, kernelOffset + 1, kernelLength}; ret[2] = new Object[]{input, output, kernelOffset, kernelLength - 1}; + ret[3] = new Object[]{input, output, 0, kernelLength}; + ret[4] = new Object[]{input, output, kernelLength - 1, kernelLength}; } else { // vertical has one more argument ret[0] = new Object[]{input, output, kernelOffset, kernelLength, null}; ret[1] = new Object[]{input, output, kernelOffset + 1, kernelLength, null}; ret[2] = new Object[]{input, output, kernelOffset, kernelLength - 1, null}; + ret[3] = new Object[]{input, output, 0, kernelLength, null}; + ret[4] = new Object[]{input, output, kernelLength - 1, kernelLength, null}; } return ret; @@ -131,7 +140,14 @@ protected void compareResults( Object targetResult, Object[] targetParam, Object ImageGray expected = (ImageGray)validationParam[2]; ImageGray found = (ImageGray)targetParam[1]; - BoofTesting.assertEquals(expected, found, 1e-4); + try { + BoofTesting.assertEquals(expected, found, 1e-4); + } catch( RuntimeException e ) { + expected.print(); + System.out.println("-------"); + found.print(); + throw e; + } } public static Object createTableKernel( Class kernelType, int offset, int length ) { @@ -148,7 +164,7 @@ public static Object createTableKernel( Class kernelType, int offset, int len return kernel; } - @Test void horizontalBorder() throws NoSuchMethodException, InvocationTargetException, IllegalAccessException { + @Test void horizontalBorder() throws InvocationTargetException, IllegalAccessException { int count = 0; Method[] methods = ImplConvolveMean.class.getMethods(); for 
(int i = 0; i < methods.length; i++) { @@ -164,6 +180,9 @@ public static Object createTableKernel( Class kernelType, int offset, int len GImageMiscOps.fillUniform(input, rand, 0, 50); +// if (input.getClass() != GrayU8.class) +// continue; + Method testMethod = null; Class[] testParams = null; for (Method method : ConvolveNormalized_JustBorder_SB.class.getMethods()) { @@ -185,6 +204,11 @@ public static Object createTableKernel( Class kernelType, int offset, int len testMethod.invoke(null, kernel, input, expected); m.invoke(null, input, found, offset, length); +// System.out.println("------"); +// ((GrayU8)expected).print(); +// System.out.println(); +// ((GrayU8)found).print(); + BoofTesting.assertEqualsBorder(expected, found, 1e-4, length, offset); } count++; @@ -193,7 +217,7 @@ public static Object createTableKernel( Class kernelType, int offset, int len assertEquals(5, count); } - @Test void verticalBorder() throws NoSuchMethodException, InvocationTargetException, IllegalAccessException { + @Test void verticalBorder() throws InvocationTargetException, IllegalAccessException { int count = 0; Method[] methods = ImplConvolveMean.class.getMethods(); for (int i = 0; i < methods.length; i++) { @@ -207,6 +231,9 @@ public static Object createTableKernel( Class kernelType, int offset, int len ImageBase expected = GeneralizedImageOps.createImage((Class)params[1], width, height, 1); ImageBase found = GeneralizedImageOps.createImage((Class)params[1], width, height, 1); +// if (input.getClass() != GrayU8.class) +// continue; + GImageMiscOps.fillUniform(input, rand, 0, 50); Method testMethod = null; @@ -233,6 +260,11 @@ public static Object createTableKernel( Class kernelType, int offset, int len testMethod.invoke(null, kernel, input, expected); m.invoke(null, input, found, offset, length, null); +// System.out.println("------"); +// ((GrayU8)expected).print(); +// System.out.println(); +// ((GrayU8)found).print(); + BoofTesting.assertEqualsBorder(expected, found, 1e-4, 
length, offset); } count++;